From b504b7371e994ca6e22a7bf2e31185b9b120d096 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sun, 16 Feb 2025 08:38:35 -0800 Subject: [PATCH 001/180] [RFC][V1] LogitsProcessor interface Signed-off-by: Nick Hill --- vllm/v1/attention/backends/flash_attn.py | 8 +- vllm/v1/attention/backends/mla/common.py | 31 ++- vllm/v1/sample/logits_processor.py | 244 +++++++++++++++++++++++ vllm/v1/sample/metadata.py | 11 +- vllm/v1/sample/ops/penalties.py | 16 -- vllm/v1/sample/sampler.py | 67 +------ vllm/v1/worker/gpu_input_batch.py | 83 ++++---- vllm/v1/worker/gpu_model_runner.py | 45 +++-- vllm/v1/worker/tpu_model_runner.py | 8 +- 9 files changed, 354 insertions(+), 159 deletions(-) create mode 100644 vllm/v1/sample/logits_processor.py diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index b4c7708daab..9c12406676a 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Attention layer with FlashAttention.""" +from collections.abc import Sequence from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional @@ -279,9 +280,10 @@ class FlashAttentionMetadataBuilder: def __init__(self, runner: "GPUModelRunner"): self.runner = runner - def reorder_batch(self, input_batch: "InputBatch", - scheduler_output: "SchedulerOutput") -> bool: - return False + def reorder_batch( + self, input_batch: "InputBatch", + scheduler_output: "SchedulerOutput") -> Sequence[tuple[int, int]]: + return () def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int, common_prefix_len: int): diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 8c7179ba0a8..dbd05428970 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -186,6 +186,7 @@ import functools from abc import abstractmethod +from collections.abc import Sequence from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Generic, Optional, TypeVar @@ -377,8 +378,11 @@ def __init__(self, ) self.page_size = self.runner.block_size - def reorder_batch(self, input_batch: "InputBatch", - scheduler_output: "SchedulerOutput") -> bool: + def reorder_batch( + self, + input_batch: "InputBatch", + scheduler_output: "SchedulerOutput", + ) -> Sequence[tuple[int, int]]: # We now want to reorder the batch so that the "decode" requests are and # the front and the "prefill" requests are at the using the least amount # swaps possible. (NOTE for now we loosely use "decode" to mean requests @@ -415,20 +419,25 @@ def reorder_batch(self, input_batch: "InputBatch", # the above loop num_decodes = len(decodes) num_prefills = len(prefills) - first_prefill = 0 - modified_batch = False + swaps = [] for i in range(1, min(num_decodes, num_prefills) + 1): # If the decode is at the "back" of the batch, i, we can swap it # with the prefill closest to the front of the batch - if decodes[num_decodes - i] >= num_decodes: - input_batch.swap_states(prefills[first_prefill], - decodes[num_decodes - i]) - first_prefill += 1 - modified_batch = True - else: + if decodes[num_decodes - i] < num_decodes: break + i1 = prefills[i - 1] + i2 = decodes[num_decodes - i] + input_batch.swap_states(i1, i2) + + # Using "move" operation of LogitsProcessors via temporary slot + # currently. 
+ # TODO possibly add more direct swap operation to LPs + swaps.append((i1, input_batch.max_num_reqs)) + swaps.append((i2, i1)) + swaps.append((input_batch.max_num_reqs, i2)) + # Save for next `build` call # TODO(lucas): this is a bit of a hack, we should probably have a # better way of doing this @@ -437,7 +446,7 @@ def reorder_batch(self, input_batch: "InputBatch", self._num_decode_tokens = num_decode_tokens self._num_prefill_tokens = num_prefill_tokens - return modified_batch + return swaps def _build_decode(self, input_positions: torch.Tensor, block_table: torch.Tensor, seq_lens: torch.Tensor): diff --git a/vllm/v1/sample/logits_processor.py b/vllm/v1/sample/logits_processor.py new file mode 100644 index 00000000000..fd168613649 --- /dev/null +++ b/vllm/v1/sample/logits_processor.py @@ -0,0 +1,244 @@ +# SPDX-License-Identifier: Apache-2.0 +import dataclasses +from abc import ABC, abstractmethod +from collections.abc import Sequence +from typing import Optional + +import torch +from torch._prims_common import DeviceLikeType + +from vllm import SamplingParams + + +@dataclasses.dataclass +class BatchUpdate: + # The current number of requests in the batch. + batch_size: int + # Batch indices of any removed requests. + removed: Sequence[int] = () + # (from, to) batch indices of any requests + # moved within the batch. + moved: Sequence[tuple[int, int]] = () + # (index, params, output_tok_ids) for new + # requests added to the batch. + added: Sequence[tuple[int, SamplingParams, list[int]]] = () + + +class LogitsProcessor(ABC): + + @abstractmethod + def apply(self, logits: torch.Tensor) -> torch.Tensor: + raise NotImplementedError + + @abstractmethod + def update_states( + self, + batch_update: Optional[BatchUpdate] = None, + ) -> None: + """Called when there are new output tokens, prior + to each forward pass. + + Args: + batch_update is non-None iff there have been + changes to the batch makeup. + """ + raise NotImplementedError + + +###### ----- LogitsProcessor impls below here + + +class MinPLogitsProcessor(LogitsProcessor): + + def __init__(self, max_num_reqs: int, pin_memory: bool, + device: DeviceLikeType): + self.min_p_count: int = 0 + + self.min_p_cpu_tensor = torch.zeros((max_num_reqs, ), + dtype=torch.float32, + device="cpu", + pin_memory=pin_memory) + self.min_p_cpu = self.min_p_cpu_tensor.numpy() + # Pre-allocated device tensor + self.min_p_gpu: torch.Tensor = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device=device) + # Current slice of the device tensor + self.min_p: torch.Tensor = self.min_p_gpu[:0] + + def update_states(self, batch_update: Optional[BatchUpdate] = None): + if not batch_update: + return + + needs_update = False + if self.min_p_count: + # Process removed and moved requests. + for index in batch_update.removed: + if self.min_p_cpu[index]: + self.min_p_count -= 1 + needs_update = True + + for from_index, to_index in batch_update.moved: + min_p = self.min_p_cpu[from_index] + self.min_p_cpu[to_index] = min_p + if min_p: + needs_update = True + + # Process added requests. + for index, sampling_params, _ in batch_update.added: + min_p = sampling_params.min_p + self.min_p_cpu[index] = min_p + if min_p: + self.min_p_count += 1 + needs_update = True + + # Update tensors if needed. 
+ size = batch_update.batch_size + if self.min_p_count and (needs_update or self.min_p.shape[0] != size): + + self.min_p = self.min_p_gpu[:size] + self.min_p.copy_(self.min_p_cpu_tensor[:size], non_blocking=True) + self.min_p.unsqueeze_(1) + + def apply(self, logits: torch.Tensor) -> torch.Tensor: + if not self.min_p_count: + return logits + + # Convert logits to probability distribution + probability_values = torch.nn.functional.softmax(logits, dim=-1) + # Calculate maximum probabilities per sequence + max_probabilities = torch.amax(probability_values, + dim=-1, + keepdim=True) + # Adjust min_p + adjusted_min_p = max_probabilities.mul_(self.min_p) + # Identify valid tokens using threshold comparison + invalid_token_mask = probability_values < adjusted_min_p + # Apply mask using boolean indexing + logits[invalid_token_mask] = -float('inf') + return logits + + +class LogitBiasLogitsProcessor(LogitsProcessor): + + def __init__(self, pin_memory: bool, device: torch.device): + self.biases: dict[int, dict[int, float]] = {} + self.device = device + self.pin_memory = pin_memory + + self.bias_tensor: torch.Tensor = torch.tensor(()) + self.logits_slice: tuple[torch.Tensor, torch.Tensor] = (torch.tensor( + ()), torch.tensor(())) + + def update_states(self, batch_update: Optional[BatchUpdate] = None): + if not batch_update: + return + + needs_update = False + if self.biases: + # Process removed and moved requests. + for index in batch_update.removed: + if self.biases.pop(index, None): + needs_update = True + + for from_index, to_index in batch_update.moved: + if entry := self.biases.pop(from_index, None): + self.biases[to_index] = entry + needs_update = True + + # Process added requests. + for index, sampling_params, _ in batch_update.added: + if lb := sampling_params.logit_bias: + self.biases[index] = lb + needs_update = True + + # Update tensors if needed. + if self.biases and needs_update: + reqs, tok_ids, biases = [], [], [] + for req, lb in self.biases.items(): + reqs.extend([req] * len(lb)) + tok_ids.extend(lb.keys()) + biases.extend(lb.values()) + + self.bias_tensor = self._tensor(biases, torch.float32) + self.logits_slice = (self._tensor(reqs, torch.int32), + self._tensor(tok_ids, torch.int32)) + + def _tensor(self, data: list, dtype: torch.dtype) -> torch.Tensor: + return (torch.tensor(data, + device="cpu", + dtype=dtype, + pin_memory=self.pin_memory).to(device=self.device, + non_blocking=True)) + + def apply(self, logits: torch.Tensor) -> torch.Tensor: + if self.biases: + logits[self.logits_slice] += self.bias_tensor + return logits + + +class MinTokensLogitsProcessor(LogitsProcessor): + + def __init__(self, pin_memory: bool, device: torch.device): + # index -> (min_toks, output_token_ids, stop_token_ids) + self.min_toks: dict[int, tuple[int, Sequence[int], set[int]]] = {} + self.device = device + self.pin_memory = pin_memory + + self.logits_slice: tuple[torch.Tensor, torch.Tensor] = (torch.tensor( + ()), torch.tensor(())) + + def update_states(self, batch_update: Optional[BatchUpdate] = None): + needs_update = False + if batch_update: + if self.min_toks: + # Process removed and moved requests. + for index in batch_update.removed: + if self.min_toks.pop(index, None): + needs_update = True + + for from_index, to_index in batch_update.moved: + if entry := self.min_toks.pop(from_index, None): + self.min_toks[to_index] = entry + needs_update = True + + # Process added requests. 
+ for index, sampling_params, output_tok_ids in batch_update.added: + if ((min_tokens := sampling_params.min_tokens) + and len(output_tok_ids) < min_tokens): + self.min_toks[index] = (min_tokens, output_tok_ids, + sampling_params.all_stop_token_ids) + needs_update = True + + if self.min_toks: + # Check for any requests that have attained their min tokens. + to_remove = tuple(index for index, (min_toks, out_tok_ids, + _) in self.min_toks.items() + if len(out_tok_ids) >= min_toks) + if to_remove: + needs_update = True + for index in to_remove: + del self.min_toks[index] + + # Update tensors if needed. + if needs_update and self.min_toks: + reqs: list[int] = [] + tok_ids: list[int] = [] + for req, (_, _, stop_tok_ids) in self.min_toks.items(): + reqs.extend([req] * len(stop_tok_ids)) + tok_ids.extend(stop_tok_ids) + + self.logits_slice = (self._tensor(reqs, torch.int32), + self._tensor(tok_ids, torch.int32)) + + def _tensor(self, data: list, dtype: torch.dtype) -> torch.Tensor: + return (torch.tensor(data, + device="cpu", + dtype=dtype, + pin_memory=self.pin_memory).to(device=self.device, + non_blocking=True)) + + def apply(self, logits: torch.Tensor) -> torch.Tensor: + if self.min_toks: + logits[self.logits_slice] = -float("inf") + return logits diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index e97e1235fb3..e113c3a50c2 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -5,6 +5,8 @@ import torch +from vllm.v1.sample.logits_processor import LogitsProcessor + @dataclass class SamplingMetadata: @@ -15,7 +17,6 @@ class SamplingMetadata: top_p: Optional[torch.Tensor] top_k: Optional[torch.Tensor] - min_p: Optional[torch.Tensor] generators: dict[int, torch.Generator] @@ -30,14 +31,12 @@ class SamplingMetadata: output_token_ids: list[list[int]] - # req_index -> (min_tokens, stop_token_ids) - min_tokens: dict[int, tuple[int, set[int]]] - - logit_bias: list[Optional[dict[int, float]]] - # `allowed_token_ids_mask` is a 2D bool tensor of shape (max batch size, # vocab size). allowed_token_ids_mask: Optional[torch.Tensor] # req_index -> bad_words_token_ids bad_words_token_ids: dict[int, list[list[int]]] + + logits_procs: list[LogitsProcessor] + nongreedy_logits_procs: list[LogitsProcessor] diff --git a/vllm/v1/sample/ops/penalties.py b/vllm/v1/sample/ops/penalties.py index ed05e3f4840..4d95bc28200 100644 --- a/vllm/v1/sample/ops/penalties.py +++ b/vllm/v1/sample/ops/penalties.py @@ -6,22 +6,6 @@ from vllm.utils import is_pin_memory_available, make_tensor_with_pad -def apply_min_token_penalties( - logits: torch.Tensor, output_token_ids: list[list[int]], - min_tokens: dict[int, tuple[int, set[int]]]) -> None: - """ - Applies minimum token penalty by setting the logits of the stop tokens - to -inf. 
- """ - min_tokens_logits_to_penalize: list[tuple[int, int]] = [] - for index, (min_token, stop_token_ids) in min_tokens.items(): - if len(output_token_ids[index]) < min_token: - for stop_token_id in stop_token_ids: - min_tokens_logits_to_penalize.append((index, stop_token_id)) - if min_tokens_logits_to_penalize: - logits[tuple(zip(*min_tokens_logits_to_penalize))] = -float("inf") - - def apply_all_penalties( logits: torch.Tensor, prompt_token_ids: torch.Tensor, diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 16561d30a6d..5fc9ee12eeb 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -7,8 +7,7 @@ from vllm.v1.outputs import LogprobsTensors, SamplerOutput from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.ops.bad_words import apply_bad_words -from vllm.v1.sample.ops.penalties import (apply_all_penalties, - apply_min_token_penalties) +from vllm.v1.sample.ops.penalties import apply_all_penalties from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler _SAMPLING_EPS = 1e-5 @@ -37,12 +36,16 @@ def forward( # Use float32 for the logits. logits = logits.to(torch.float32) + # Apply allowed token ids. logits = self.apply_allowed_token_ids(logits, sampling_metadata) # Apply bad words exclusion. logits = self.apply_bad_words(logits, sampling_metadata) - # Apply logits bias. - logits = self.apply_logits_bias(logits, sampling_metadata) + + # Apply logits processors. + for processor in sampling_metadata.logits_procs: + logits = processor.apply(logits) + # Apply penalties (e.g., min_tokens, freq_penalties). logits = self.apply_penalties(logits, sampling_metadata) # Sample the next token. @@ -107,9 +110,9 @@ def sample( # Apply temperature. logits = self.apply_temperature(logits, sampling_metadata.temperature) - # Apply min_p. - if sampling_metadata.min_p is not None: - logits = self.apply_min_p(logits, sampling_metadata.min_p) + # Apply logits processors. + for processor in sampling_metadata.nongreedy_logits_procs: + logits = processor.apply(logits) # Apply top_k and/or top_p. random_sampled = self.topk_topp_sampler( @@ -184,10 +187,6 @@ def apply_penalties( logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> torch.Tensor: - if sampling_metadata.min_tokens: - apply_min_token_penalties(logits, - sampling_metadata.output_token_ids, - sampling_metadata.min_tokens) if not sampling_metadata.no_penalties: assert sampling_metadata.prompt_token_ids is not None logits = apply_all_penalties( @@ -200,52 +199,6 @@ def apply_penalties( ) return logits - def apply_min_p( - self, - logits: torch.Tensor, - min_p: torch.Tensor, - ) -> torch.Tensor: - """ - Filters logits using adaptive probability thresholding. - """ - # Convert logits to probability distribution - probability_values = torch.nn.functional.softmax(logits, dim=-1) - # Calculate maximum probabilities per sequence - max_probabilities = torch.amax(probability_values, - dim=-1, - keepdim=True) - # Reshape min_p for broadcasting - adjusted_min_p = min_p.unsqueeze(1) * max_probabilities - # Identify valid tokens using threshold comparison - valid_token_mask = probability_values >= adjusted_min_p - # Apply mask using boolean indexing - logits[~valid_token_mask] = -float('inf') - return logits - - def apply_logits_bias( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> torch.Tensor: - # TODO(houseroad): this implementation is extremely inefficient. 
- # One idea is implement this as a PyTorch C++ op, and we may - # even optimize the logit_bias layout. - - # Get vocabulary size from logits - vocab_size = logits.shape[-1] - - for i, logit_bias in enumerate(sampling_metadata.logit_bias): - if logit_bias: - for token_id, bias in logit_bias.items(): - # Check token_id bounds to ensure within vocabulary - if token_id < 0 or token_id >= vocab_size: - raise ValueError( - f"token_id {token_id} in logit_bias contains " - f"out-of-vocab token id. Vocabulary size: " - f"{vocab_size}") - logits[i, token_id] += bias - return logits - def apply_allowed_token_ids( self, logits: torch.Tensor, diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index a64cb97e012..4f04072b96f 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -12,6 +12,10 @@ from vllm.sampling_params import SamplingParams, SamplingType from vllm.utils import swap_dict_values from vllm.v1.outputs import LogprobsTensors +from vllm.v1.sample.logits_processor import (LogitBiasLogitsProcessor, + LogitsProcessor, + MinPLogitsProcessor, + MinTokensLogitsProcessor) from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.utils import copy_slice from vllm.v1.worker.block_table import BlockTable @@ -137,16 +141,6 @@ def __init__( self.top_k_cpu = self.top_k_cpu_tensor.numpy() self.top_k_reqs: set[str] = set() - self.min_p = torch.empty((max_num_reqs, ), - dtype=torch.float32, - device=device) - self.min_p_cpu_tensor = torch.empty((max_num_reqs, ), - dtype=torch.float32, - device="cpu", - pin_memory=pin_memory) - self.min_p_cpu = self.min_p_cpu_tensor.numpy() - self.min_p_reqs: set[str] = set() - # Frequency penalty related data structures self.frequency_penalties = torch.empty((max_num_reqs, ), dtype=torch.float, @@ -185,8 +179,7 @@ def __init__( self.repetition_penalties_cpu_tensor.numpy() self.repetition_penalties_reqs: set[str] = set() - # req_index -> (min_tokens, stop_token_ids) - self.min_tokens: dict[int, tuple[int, set[int]]] = {} + self.prompt_token_ids: Optional[torch.Tensor] = None # lora related self.request_lora_mapping = np.zeros((self.max_num_reqs, ), @@ -207,8 +200,19 @@ def __init__( # To accumulate prompt logprobs tensor chunks across prefill steps. self.in_progress_prompt_logprobs_cpu: dict[str, LogprobsTensors] = {} - self.logit_bias: list[Optional[dict[int, - float]]] = [None] * max_num_reqs + self.logit_procs: list[LogitsProcessor] = [ + MinTokensLogitsProcessor(pin_memory=pin_memory, device=device), + LogitBiasLogitsProcessor(pin_memory=pin_memory, device=device), + ] + self.nongreedy_logits_procs: list[LogitsProcessor] = [ + MinPLogitsProcessor( + pin_memory=pin_memory, + device=device, + # +1 for temporary swap space + max_num_reqs=max_num_reqs + 1) + ] + + # TODO convert this to LogitsProcessor self.has_allowed_token_ids: set[str] = set() # NOTE(lufang): In the mask tensor, if the corresponding token allowed, # the value is False. Since we use masked_fill_ to set -inf. 
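For readers of the RFC: the hunks above replace InputBatch's per-field min_p, logit_bias and min_tokens state with self-contained LogitsProcessor objects that are told about batch membership changes through BatchUpdate. The following minimal sketch (not part of the patch) shows what an out-of-tree processor could look like against the interface added in vllm/v1/sample/logits_processor.py; the class name and the "no immediate repeat" behaviour are hypothetical illustrations, not something this change introduces.

from typing import Optional

import torch

from vllm.v1.sample.logits_processor import BatchUpdate, LogitsProcessor


class NoImmediateRepeatLogitsProcessor(LogitsProcessor):
    """Hypothetical example: mask each request's most recent output token
    so it cannot be sampled twice in a row."""

    def __init__(self):
        # Batch index -> live reference to that request's output token ids.
        self.out_tok_ids: dict[int, list[int]] = {}

    def update_states(self,
                      batch_update: Optional[BatchUpdate] = None) -> None:
        if not batch_update:
            # Batch makeup is unchanged; nothing to do for this processor.
            return
        # Mirror the removed/moved/added handling of the in-tree processors.
        for index in batch_update.removed:
            self.out_tok_ids.pop(index, None)
        for from_index, to_index in batch_update.moved:
            if (entry := self.out_tok_ids.pop(from_index, None)) is not None:
                self.out_tok_ids[to_index] = entry
        for index, _params, output_tok_ids in batch_update.added:
            self.out_tok_ids[index] = output_tok_ids

    def apply(self, logits: torch.Tensor) -> torch.Tensor:
        for index, out_ids in self.out_tok_ids.items():
            if out_ids:
                logits[index, out_ids[-1]] = -float("inf")
        return logits

Keeping the live output_tok_ids list passed in BatchUpdate.added is how a processor can observe newly sampled tokens between steps without extra plumbing; MinTokensLogitsProcessor above relies on the same property.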
@@ -233,7 +237,7 @@ def add_request( self, request: "CachedRequestState", req_index: Optional[int] = None, - ) -> None: + ) -> int: if req_index is None: req_index = self.num_reqs assert req_index < self.max_num_reqs @@ -284,11 +288,8 @@ def add_request( else: top_k = self.vocab_size self.top_k_cpu[req_index] = top_k - self.min_p_cpu[req_index] = sampling_params.min_p self.frequency_penalties_cpu[ req_index] = sampling_params.frequency_penalty - if sampling_params.min_p > _SAMPLING_EPS: - self.min_p_reqs.add(req_id) if sampling_params.frequency_penalty != 0.0: self.frequency_penalties_reqs.add(req_id) self.presence_penalties_cpu[ @@ -299,9 +300,6 @@ def add_request( req_index] = sampling_params.repetition_penalty if sampling_params.repetition_penalty != 1.0: self.repetition_penalties_reqs.add(req_id) - if sampling_params.min_tokens: - self.min_tokens[req_index] = (sampling_params.min_tokens, - sampling_params.all_stop_token_ids) # NOTE(woosuk): self.generators should not include the requests that # do not have their own generator. @@ -312,8 +310,6 @@ def add_request( self.num_logprobs[req_id] = sampling_params.logprobs if sampling_params.prompt_logprobs is not None: self.num_prompt_logprobs[req_id] = sampling_params.prompt_logprobs - if sampling_params.logit_bias is not None: - self.logit_bias[req_index] = sampling_params.logit_bias if sampling_params.allowed_token_ids: self.has_allowed_token_ids.add(req_id) @@ -351,6 +347,8 @@ def add_request( # No LoRA self.request_lora_mapping[req_index] = 0 + return req_index + def remove_request(self, req_id: str) -> Optional[int]: """This method must always be followed by a call to condense().""" @@ -364,8 +362,6 @@ def remove_request(self, req_id: str) -> Optional[int]: self.random_reqs.discard(req_id) self.top_p_reqs.discard(req_id) self.top_k_reqs.discard(req_id) - self.min_p_reqs.discard(req_id) - self.min_tokens.pop(req_index, None) self.frequency_penalties_reqs.discard(req_id) self.presence_penalties_reqs.discard(req_id) self.repetition_penalties_reqs.discard(req_id) @@ -383,7 +379,6 @@ def remove_request(self, req_id: str) -> Optional[int]: self.lora_id_to_lora_request.pop(lora_id) self.request_lora_mapping[req_index] = 0 - self.logit_bias[req_index] = None self.has_allowed_token_ids.discard(req_id) if self.allowed_token_ids_mask_cpu_tensor is not None: # False means we don't fill with -inf. @@ -421,8 +416,6 @@ def swap_states(self, i1: int, i2: int) -> None: self.presence_penalties_cpu[i2], self.presence_penalties_cpu[i1] self.repetition_penalties_cpu[i1], self.repetition_penalties_cpu[i2] =\ self.repetition_penalties_cpu[i2], self.repetition_penalties_cpu[i1] - self.min_p_cpu[i1], self.min_p_cpu[i2] =\ - self.min_p_cpu[i2], self.min_p_cpu[i1] # NOTE: the following is unsafe # self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\ @@ -434,32 +427,33 @@ def swap_states(self, i1: int, i2: int) -> None: self.token_ids_cpu[i2, ...] 
= tmp swap_dict_values(self.generators, i1, i2) - swap_dict_values(self.min_tokens, i1, i2) swap_dict_values(self.bad_words_token_ids, i1, i2) self.request_lora_mapping[i1], self.request_lora_mapping[i2] =\ self.request_lora_mapping[i2], self.request_lora_mapping[i1] - self.logit_bias[i1], self.logit_bias[i2] =\ - self.logit_bias[i2], self.logit_bias[i1] if self.allowed_token_ids_mask_cpu_tensor is not None: self.allowed_token_ids_mask_cpu_tensor[i1], \ self.allowed_token_ids_mask_cpu_tensor[i2] =\ self.allowed_token_ids_mask_cpu_tensor[i2], \ self.allowed_token_ids_mask_cpu_tensor[i1] + + # TODO need to handle LogitsProcessors here + self.block_table.swap_row(i1, i2) - def condense(self, empty_req_indices: list[int]) -> None: + def condense(self, empty_req_indices: list[int]) -> list[tuple[int, int]]: num_reqs = self.num_reqs if num_reqs == 0: # The batched states are empty. self._req_ids.clear() self.req_output_token_ids.clear() - return + return [] # NOTE(woosuk): This function assumes that the empty_req_indices # is sorted in descending order. last_req_index = num_reqs + len(empty_req_indices) - 1 + swaps = [] while empty_req_indices: # Find the largest non-empty index. while last_req_index in empty_req_indices: @@ -471,6 +465,7 @@ def condense(self, empty_req_indices: list[int]) -> None: break # Swap the states. + swaps.append((last_req_index, empty_index)) req_id = self._req_ids[last_req_index] output_token_ids = self.req_output_token_ids[last_req_index] assert req_id is not None @@ -501,20 +496,14 @@ def condense(self, empty_req_indices: list[int]) -> None: empty_index] = self.presence_penalties_cpu[last_req_index] self.repetition_penalties_cpu[ empty_index] = self.repetition_penalties_cpu[last_req_index] - self.min_p_cpu[empty_index] = self.min_p_cpu[last_req_index] generator = self.generators.pop(last_req_index, None) if generator is not None: self.generators[empty_index] = generator - min_token = self.min_tokens.pop(last_req_index, None) - if min_token is not None: - self.min_tokens[empty_index] = min_token - self.request_lora_mapping[empty_index] = self.request_lora_mapping[ last_req_index] - self.logit_bias[empty_index] = self.logit_bias[last_req_index] - + # TODO convert these to LogitsProcessors if self.allowed_token_ids_mask_cpu_tensor is not None: self.allowed_token_ids_mask_cpu_tensor[ empty_index] = self.allowed_token_ids_mask_cpu_tensor[ @@ -524,6 +513,7 @@ def condense(self, empty_req_indices: list[int]) -> None: last_req_index, None) if bad_words_token_ids is not None: self.bad_words_token_ids[empty_index] = bad_words_token_ids + # Decrement last_req_index since it is now empty. 
last_req_index -= 1 @@ -531,6 +521,8 @@ def condense(self, empty_req_indices: list[int]) -> None: del self._req_ids[self.num_reqs:] del self.req_output_token_ids[self.num_reqs:] + return swaps + def refresh_sampling_metadata(self): self.sampling_metadata = self._make_sampling_metadata() @@ -545,8 +537,6 @@ def _make_sampling_metadata(self) -> SamplingMetadata: copy_slice(self.top_p_cpu_tensor, self.top_p, num_reqs) if not self.no_top_k: copy_slice(self.top_k_cpu_tensor, self.top_k, num_reqs) - if not self.no_min_p: - copy_slice(self.min_p_cpu_tensor, self.min_p, num_reqs) if not self.no_penalties: # Since syncing these tensors is expensive only copy them @@ -579,7 +569,6 @@ def _make_sampling_metadata(self) -> SamplingMetadata: all_random=self.all_random, top_p=None if self.no_top_p else self.top_p[:num_reqs], top_k=None if self.no_top_k else self.top_k[:num_reqs], - min_p=None if self.no_min_p else self.min_p[:num_reqs], generators=self.generators, max_num_logprobs=self.max_num_logprobs, prompt_token_ids=prompt_token_ids, @@ -587,11 +576,11 @@ def _make_sampling_metadata(self) -> SamplingMetadata: presence_penalties=self.presence_penalties[:num_reqs], repetition_penalties=self.repetition_penalties[:num_reqs], output_token_ids=cast(list[list[int]], self.req_output_token_ids), - min_tokens=self.min_tokens, no_penalties=self.no_penalties, - logit_bias=self.logit_bias[:num_reqs], allowed_token_ids_mask=allowed_token_ids_mask, bad_words_token_ids=self.bad_words_token_ids, + logits_procs=self.logit_procs, + nongreedy_logits_procs=self.nongreedy_logits_procs, ) def _make_prompt_token_ids_tensor(self) -> torch.Tensor: @@ -655,10 +644,6 @@ def no_top_p(self) -> bool: def no_top_k(self) -> bool: return len(self.top_k_reqs) == 0 - @property - def no_min_p(self) -> bool: - return len(self.min_p_reqs) == 0 - @property def no_penalties(self) -> bool: return (len(self.presence_penalties_reqs) == 0 diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c3d84ab3773..b38c0cde1c6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3,6 +3,7 @@ import gc import time import weakref +from itertools import chain from typing import TYPE_CHECKING, Optional, Union import numpy as np @@ -34,6 +35,7 @@ SlidingWindowSpec) from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors, ModelRunnerOutput) +from vllm.v1.sample.logits_processor import BatchUpdate from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import RejectionSampler from vllm.v1.spec_decode.eagle import EagleProposer @@ -443,6 +445,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Add the new or resumed requests to the persistent batch. # The smaller empty indices are filled first. + removed = removed_req_indices + added = [] removed_req_indices = sorted(removed_req_indices, reverse=True) for req_id in req_ids_to_add: req_state = self.requests[req_id] @@ -452,11 +456,35 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: else: # Append to the end. req_index = None - self.input_batch.add_request(req_state, req_index) + req_index = self.input_batch.add_request(req_state, req_index) + added.append((req_index, req_state.sampling_params, + req_state.output_token_ids)) # Condense the batched states if there are empty indices. 
if removed_req_indices: - self.input_batch.condense(removed_req_indices) + moved = self.input_batch.condense(removed_req_indices) + else: + moved = [] + + # Some attention backends (namely MLA) may want to separate requests + # based on if the attention computation will be compute-bound or + # memory-bound. This gives them a hook to do that. + if swaps := self.attn_metadata_builder.reorder_batch( + self.input_batch, scheduler_output): + moved.extend(swaps) + batch_changed = True + + # Update states of logits processors + batch_update = None if not batch_changed else BatchUpdate( + removed=removed, + moved=moved, + added=added, + batch_size=self.input_batch.num_reqs, + ) + + for processor in chain(self.input_batch.logit_procs, + self.input_batch.nongreedy_logits_procs): + processor.update_states(batch_update) if batch_changed: self.input_batch.refresh_sampling_metadata() @@ -471,14 +499,6 @@ def _prepare_inputs( num_reqs = self.input_batch.num_reqs assert num_reqs > 0 - # Some attention backends (namely MLA) may want to separate requests - # based on if the attention computation will be compute-bound or - # memory-bound. This gives them a hook to do that. - modified_batch = self.attn_metadata_builder.reorder_batch( - self.input_batch, scheduler_output) - if modified_batch: - self.input_batch.refresh_sampling_metadata() - # OPTIMIZATION: Start copying the block table first. # This way, we can overlap the copy with the following CPU operations. self.input_batch.block_table.commit(num_reqs) @@ -1468,7 +1488,6 @@ def _dummy_sampler_run( all_random=False, top_p=dummy_tensors(0.9), top_k=dummy_tensors(logits.size(1) - 1), - min_p=None, generators={}, max_num_logprobs=None, no_penalties=True, @@ -1477,10 +1496,10 @@ def _dummy_sampler_run( presence_penalties=dummy_tensors(0.1), repetition_penalties=dummy_tensors(0.1), output_token_ids=[[] for _ in range(num_reqs)], - min_tokens={}, - logit_bias=[None for _ in range(num_reqs)], allowed_token_ids_mask=None, bad_words_token_ids={}, + logits_procs=[], + nongreedy_logits_procs=[], ) try: sampler_output = self.model.sample( diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index c61c449e179..33d43937aa8 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1021,7 +1021,7 @@ def sample_from_hidden( sampling_metadata: TPUSupportedSamplingMetadata, ) -> torch.Tensor: """ - Sample with xla-friendly function. This function is to be traced + Sample with xla-friendly function. This function is to be traced separately from `forward` for lighter compilation overhead. """ logits = self.model.compute_logits(sample_hidden_states, None) @@ -1059,13 +1059,13 @@ def _get_padded_num_reqs_with_upper_limit(x: int, upper_limit: int) -> int: def _get_token_paddings(min_token_size: int, max_token_size: int, padding_gap: int) -> list[int]: - """Generate a list of padding size, starting from min_token_size, + """Generate a list of padding size, starting from min_token_size, ending with a number that can cover max_token_size - + If padding_gap == 0 then: increase 2X each time (exponential) else: - first increase the size to twice, + first increase the size to twice, then increase the padding size by padding_gap. 
""" # assert min_token_size is power of 2 From 55328d84a7d53d149cedf39fb50f2e793aa299b5 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 18 Apr 2025 17:25:21 +0000 Subject: [PATCH 002/180] extra_args Signed-off-by: Andrew Feldman --- vllm/entrypoints/openai/protocol.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 4639b4cea06..ed6b9927421 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -242,6 +242,12 @@ class ChatCompletionRequest(OpenAIBaseModel): ChatCompletionNamedToolChoiceParam, ]] = "none" + # Custom args param + extra_args: Optional[dict[str, Any]] = Field( + default=None, + description=("Additional kwargs to pass to sampling."), + ) + # NOTE this will be ignored by vLLM -- the model determines the behavior parallel_tool_calls: Optional[bool] = False user: Optional[str] = None @@ -514,7 +520,8 @@ def to_sampling_params( output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, guided_decoding=guided_decoding, - logit_bias=self.logit_bias) + logit_bias=self.logit_bias, + extra_args=self.extra_args) def _get_guided_json_from_tool( self) -> Optional[Union[str, dict, BaseModel]]: @@ -718,6 +725,12 @@ class CompletionRequest(OpenAIBaseModel): top_p: Optional[float] = None user: Optional[str] = None + # Custom args param + extra_args: Optional[dict[str, Any]] = Field( + default=None, + description=("Additional kwargs to pass to sampling."), + ) + # doc: begin-completion-sampling-params use_beam_search: bool = False top_k: Optional[int] = None @@ -932,7 +945,8 @@ def to_sampling_params( else RequestOutputKind.FINAL_ONLY, guided_decoding=guided_decoding, logit_bias=self.logit_bias, - allowed_token_ids=self.allowed_token_ids) + allowed_token_ids=self.allowed_token_ids, + extra_args=self.extra_args) @model_validator(mode="before") @classmethod @@ -1586,6 +1600,12 @@ class TranscriptionRequest(OpenAIBaseModel): to automatically increase the temperature until certain thresholds are hit. """ + # Custom args param + extra_args: Optional[dict[str, Any]] = Field( + default=None, + description=("Additional kwargs to pass to sampling."), + ) + timestamp_granularities: list[Literal["word", "segment"]] = Field( alias="timestamp_granularities[]", default=[]) """The timestamp granularities to populate for this transcription. 
@@ -1628,7 +1648,8 @@ def to_sampling_params( max_tokens=max_tokens, output_kind=RequestOutputKind.DELTA if self.stream \ - else RequestOutputKind.FINAL_ONLY) + else RequestOutputKind.FINAL_ONLY, + extra_args=self.extra_args) @model_validator(mode="before") @classmethod From 191b9e1aa91550b31bb50fde7d3627fbddbba7d5 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 22 Apr 2025 06:08:06 +0000 Subject: [PATCH 003/180] rename Signed-off-by: Andrew Feldman --- vllm/entrypoints/openai/protocol.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 6b5d4077562..b3a1e4c34f3 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -242,8 +242,8 @@ class ChatCompletionRequest(OpenAIBaseModel): ChatCompletionNamedToolChoiceParam, ]] = "none" - # Custom args param - extra_args: Optional[dict[str, Any]] = Field( + # Custom sampling params + extra_sampling_params: Optional[dict[str, Any]] = Field( default=None, description=("Additional kwargs to pass to sampling."), ) @@ -521,7 +521,7 @@ def to_sampling_params( else RequestOutputKind.FINAL_ONLY, guided_decoding=guided_decoding, logit_bias=self.logit_bias, - extra_args=self.extra_args) + extra_args=self.extra_sampling_params) def _get_guided_json_from_tool( self) -> Optional[Union[str, dict, BaseModel]]: @@ -726,7 +726,7 @@ class CompletionRequest(OpenAIBaseModel): user: Optional[str] = None # Custom args param - extra_args: Optional[dict[str, Any]] = Field( + extra_sampling_params: Optional[dict[str, Any]] = Field( default=None, description=("Additional kwargs to pass to sampling."), ) @@ -946,7 +946,7 @@ def to_sampling_params( guided_decoding=guided_decoding, logit_bias=self.logit_bias, allowed_token_ids=self.allowed_token_ids, - extra_args=self.extra_args) + extra_args=self.extra_sampling_params) @model_validator(mode="before") @classmethod From 1b658cdf5e03cc1d44add219551d81049f0511b1 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 22 Apr 2025 06:09:05 +0000 Subject: [PATCH 004/180] rename Signed-off-by: Andrew Feldman --- vllm/entrypoints/openai/protocol.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index b3a1e4c34f3..85838cc97fb 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1590,14 +1590,14 @@ class TranscriptionRequest(OpenAIBaseModel): `verbose_json`, or `vtt`. """ - ## TODO (varun) : Support if set to 0, certain thresholds are met !! - # Custom args param - extra_args: Optional[dict[str, Any]] = Field( + extra_sampling_params: Optional[dict[str, Any]] = Field( default=None, description=("Additional kwargs to pass to sampling."), ) + ## TODO (varun) : Support if set to 0, certain thresholds are met !! + timestamp_granularities: list[Literal["word", "segment"]] = Field( alias="timestamp_granularities[]", default=[]) """The timestamp granularities to populate for this transcription. 
@@ -1705,7 +1705,7 @@ def to_sampling_params( output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, - extra_args=self.extra_args) + extra_args=self.extra_sampling_params) @model_validator(mode="before") @classmethod From 6a0f87c95b14f40d80f67048f403af643143c908 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 22 Apr 2025 07:25:53 +0000 Subject: [PATCH 005/180] extra_body Signed-off-by: Andrew Feldman --- vllm/entrypoints/openai/api_server.py | 14 ++++++++++++++ vllm/entrypoints/openai/protocol.py | 18 ++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 13681958089..1d8ec50692a 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -463,6 +463,17 @@ async def show_version(): return JSONResponse(content=ver) +RequestWithExtraBody = Union[CompletionRequest, ChatCompletionRequest, + TranscriptionRequest] + + +def _merge_extra_body(request: RequestWithExtraBody) -> None: + """Integrate extra body arguments""" + for key, value in request.extra_body.items(): + setattr(request, key, value) + request.extra_body = None + + @router.post("/v1/chat/completions", dependencies=[Depends(validate_json_request)]) @with_cancellation @@ -494,6 +505,9 @@ async def create_completion(request: CompletionRequest, raw_request: Request): if handler is None: return base(raw_request).create_error_response( message="The model does not support Completions API") + if request.extra_body: + # Integrate extra body arguments + _merge_extra_body(request) generator = await handler.create_completion(request, raw_request) if isinstance(generator, ErrorResponse): diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 85838cc97fb..e2eed0b1326 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -248,6 +248,12 @@ class ChatCompletionRequest(OpenAIBaseModel): description=("Additional kwargs to pass to sampling."), ) + # Catch-all for request attributes beyond the OpenAI API spec + extra_body: Optional[dict[str, Any]] = Field( + default=None, + description=("Specify arguments beyond the OpenAI API spec."), + ) + # NOTE this will be ignored by vLLM -- the model determines the behavior parallel_tool_calls: Optional[bool] = False user: Optional[str] = None @@ -731,6 +737,12 @@ class CompletionRequest(OpenAIBaseModel): description=("Additional kwargs to pass to sampling."), ) + # Catch-all for request attributes beyond the OpenAI API spec + extra_body: Optional[dict[str, Any]] = Field( + default=None, + description=("Specify arguments beyond the OpenAI API spec."), + ) + # doc: begin-completion-sampling-params use_beam_search: bool = False top_k: Optional[int] = None @@ -1596,6 +1608,12 @@ class TranscriptionRequest(OpenAIBaseModel): description=("Additional kwargs to pass to sampling."), ) + # Catch-all for request attributes beyond the OpenAI API spec + extra_body: Optional[dict[str, Any]] = Field( + default=None, + description=("Specify arguments beyond the OpenAI API spec."), + ) + ## TODO (varun) : Support if set to 0, certain thresholds are met !! 
timestamp_granularities: list[Literal["word", "segment"]] = Field( From ac57a7f51763ffccb29d71f305c37e96be50b599 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 22 Apr 2025 08:21:10 +0000 Subject: [PATCH 006/180] completion custom arg unit test Signed-off-by: Andrew Feldman --- .../v1/entrypoints/openai/test_completion.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py index 57ca99e1f68..4bdf14d9927 100644 --- a/tests/v1/entrypoints/openai/test_completion.py +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -80,6 +80,38 @@ async def test_single_completion(client: openai.AsyncOpenAI, assert completion.choices[0].prompt_logprobs is None +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_custom_arg(client: openai.AsyncOpenAI, model_name: str) -> None: + """Test that custom arg works and does not break completion. + Issue a request with a contradictory `max_tokens` setting + in `extra_body`; test that the value in `extra_body` was + applied. + """ + completion = await client.completions.create( + model=model_name, + prompt="Hello, my name is", + max_tokens=10, + temperature=0.0, + # Contradictory `max_tokens` + extra_body={ + "max_tokens": 5, + "ignore_eos": True + }) + + # Assert: valid completion with `extra_body["max_tokens"]` tokens + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 1 + choice = completion.choices[0] + assert len(choice.text) >= 5 + assert choice.finish_reason == "length" + assert completion.usage == openai.types.CompletionUsage( + completion_tokens=5, prompt_tokens=6, total_tokens=11) + + @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", From 5c436091cc3cbb6c1a16496aff3bda3dfa338159 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 23 Apr 2025 14:47:35 +0000 Subject: [PATCH 007/180] tweak extra_args; test sampling params extra args via api Signed-off-by: Andrew Feldman --- .../v1/entrypoints/openai/test_completion.py | 42 +++++++++++++++---- vllm/sampling_params.py | 12 +++--- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py index 4bdf14d9927..efaf5cc23db 100644 --- a/tests/v1/entrypoints/openai/test_completion.py +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -85,13 +85,16 @@ async def test_single_completion(client: openai.AsyncOpenAI, "model_name", [MODEL_NAME], ) -async def test_custom_arg(client: openai.AsyncOpenAI, model_name: str) -> None: +async def test_completion_custom_arg(client: openai.AsyncOpenAI, + model_name: str) -> None: """Test that custom arg works and does not break completion. - Issue a request with a contradictory `max_tokens` setting + 1. Issue a request with a contradictory `max_tokens` setting in `extra_body`; test that the value in `extra_body` was applied. + 2. Issue a request with a contradictory `max_tokens` setting + in `extra_sampling_params`; test that the value is applied. 
""" - completion = await client.completions.create( + completion_body = await client.completions.create( model=model_name, prompt="Hello, my name is", max_tokens=10, @@ -103,12 +106,37 @@ async def test_custom_arg(client: openai.AsyncOpenAI, model_name: str) -> None: }) # Assert: valid completion with `extra_body["max_tokens"]` tokens - assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 1 - choice = completion.choices[0] + assert completion_body.id is not None + assert completion_body.choices is not None and len( + completion_body.choices) == 1 + choice = completion_body.choices[0] assert len(choice.text) >= 5 assert choice.finish_reason == "length" - assert completion.usage == openai.types.CompletionUsage( + assert completion_body.usage == openai.types.CompletionUsage( + completion_tokens=5, prompt_tokens=6, total_tokens=11) + + completion_sampling_params = await client.completions.create( + model=model_name, + prompt="Hello, my name is", + temperature=0.0, + # Contradictory `max_tokens` + extra_body={ + "ignore_eos": True, + "extra_sampling_params": { + # Contradictory max_tokens + "max_tokens": 5 + } + }) + + # Assert: valid completion with + # `extra_body["extra_sampling_params"]["max_tokens"]` tokens + assert completion_sampling_params.id is not None + assert (completion_sampling_params.choices is not None + and len(completion_sampling_params.choices) == 1) + choice = completion_sampling_params.choices[0] + assert len(choice.text) >= 5 + assert choice.finish_reason == "length" + assert completion_sampling_params.usage == openai.types.CompletionUsage( completion_tokens=5, prompt_tokens=6, total_tokens=11) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 707a757ca83..319ba42b6c0 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -199,7 +199,7 @@ class SamplingParams( Defaults to None. extra_args: Arbitrary additional args, that can be used by custom sampling implementations. Not used by any in-tree sampling - implementations. + implementations. (Not actually a class member.) 
""" n: int = 1 @@ -242,7 +242,6 @@ class SamplingParams( guided_decoding: Optional[GuidedDecodingParams] = None logit_bias: Optional[dict[int, float]] = None allowed_token_ids: Optional[list[int]] = None - extra_args: Optional[dict[str, Any]] = None # Fields used for bad words bad_words: Optional[list[str]] = None @@ -288,8 +287,7 @@ def from_optional( int(token): min(100.0, max(-100.0, bias)) for token, bias in logit_bias.items() } - - return SamplingParams( + sampling_params = SamplingParams( n=1 if n is None else n, best_of=best_of, presence_penalty=0.0 @@ -321,8 +319,12 @@ def from_optional( guided_decoding=guided_decoding, logit_bias=logit_bias, allowed_token_ids=allowed_token_ids, - extra_args=extra_args, ) + # Custom sampling params + if extra_args: + for attr_name, attr_val in extra_args.items(): + setattr(sampling_params, attr_name, attr_val) + return sampling_params def __post_init__(self) -> None: # how we deal with `best_of``: From 368f907aa05c1c26142378c14e56e5734fa57f29 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 23 Apr 2025 14:52:23 +0000 Subject: [PATCH 008/180] remove unnecessary extra_body field/breakout Signed-off-by: Andrew Feldman --- vllm/entrypoints/openai/api_server.py | 14 -------------- vllm/entrypoints/openai/protocol.py | 18 ------------------ 2 files changed, 32 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 1d8ec50692a..13681958089 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -463,17 +463,6 @@ async def show_version(): return JSONResponse(content=ver) -RequestWithExtraBody = Union[CompletionRequest, ChatCompletionRequest, - TranscriptionRequest] - - -def _merge_extra_body(request: RequestWithExtraBody) -> None: - """Integrate extra body arguments""" - for key, value in request.extra_body.items(): - setattr(request, key, value) - request.extra_body = None - - @router.post("/v1/chat/completions", dependencies=[Depends(validate_json_request)]) @with_cancellation @@ -505,9 +494,6 @@ async def create_completion(request: CompletionRequest, raw_request: Request): if handler is None: return base(raw_request).create_error_response( message="The model does not support Completions API") - if request.extra_body: - # Integrate extra body arguments - _merge_extra_body(request) generator = await handler.create_completion(request, raw_request) if isinstance(generator, ErrorResponse): diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index e2eed0b1326..85838cc97fb 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -248,12 +248,6 @@ class ChatCompletionRequest(OpenAIBaseModel): description=("Additional kwargs to pass to sampling."), ) - # Catch-all for request attributes beyond the OpenAI API spec - extra_body: Optional[dict[str, Any]] = Field( - default=None, - description=("Specify arguments beyond the OpenAI API spec."), - ) - # NOTE this will be ignored by vLLM -- the model determines the behavior parallel_tool_calls: Optional[bool] = False user: Optional[str] = None @@ -737,12 +731,6 @@ class CompletionRequest(OpenAIBaseModel): description=("Additional kwargs to pass to sampling."), ) - # Catch-all for request attributes beyond the OpenAI API spec - extra_body: Optional[dict[str, Any]] = Field( - default=None, - description=("Specify arguments beyond the OpenAI API spec."), - ) - # doc: begin-completion-sampling-params use_beam_search: bool = False top_k: 
Optional[int] = None @@ -1608,12 +1596,6 @@ class TranscriptionRequest(OpenAIBaseModel): description=("Additional kwargs to pass to sampling."), ) - # Catch-all for request attributes beyond the OpenAI API spec - extra_body: Optional[dict[str, Any]] = Field( - default=None, - description=("Specify arguments beyond the OpenAI API spec."), - ) - ## TODO (varun) : Support if set to 0, certain thresholds are met !! timestamp_granularities: list[Literal["word", "segment"]] = Field( From a90311a94beffcd4ce57e3030c85c6e4df99ca1b Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 23 Apr 2025 14:57:34 +0000 Subject: [PATCH 009/180] removed transcription scenario Signed-off-by: Andrew Feldman --- vllm/entrypoints/openai/protocol.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 85838cc97fb..59ae529cefc 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1590,12 +1590,6 @@ class TranscriptionRequest(OpenAIBaseModel): `verbose_json`, or `vtt`. """ - # Custom args param - extra_sampling_params: Optional[dict[str, Any]] = Field( - default=None, - description=("Additional kwargs to pass to sampling."), - ) - ## TODO (varun) : Support if set to 0, certain thresholds are met !! timestamp_granularities: list[Literal["word", "segment"]] = Field( @@ -1704,8 +1698,7 @@ def to_sampling_params( presence_penalty=self.presence_penalty, output_kind=RequestOutputKind.DELTA if self.stream \ - else RequestOutputKind.FINAL_ONLY, - extra_args=self.extra_sampling_params) + else RequestOutputKind.FINAL_ONLY) @model_validator(mode="before") @classmethod From 42b0d31b887b42b4c3897381c28076c8c314900e Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 1 May 2025 14:23:10 +0000 Subject: [PATCH 010/180] small changes Signed-off-by: Andrew Feldman --- vllm/v1/sample/logits_processor.py | 6 +++--- vllm/v1/sample/metadata.py | 3 +++ vllm/v1/worker/tpu_model_runner.py | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/v1/sample/logits_processor.py b/vllm/v1/sample/logits_processor.py index fd168613649..7d9342ac1b3 100644 --- a/vllm/v1/sample/logits_processor.py +++ b/vllm/v1/sample/logits_processor.py @@ -60,11 +60,11 @@ def __init__(self, max_num_reqs: int, pin_memory: bool, pin_memory=pin_memory) self.min_p_cpu = self.min_p_cpu_tensor.numpy() # Pre-allocated device tensor - self.min_p_gpu: torch.Tensor = torch.empty((max_num_reqs, ), + self.min_p_device: torch.Tensor = torch.empty((max_num_reqs, ), dtype=torch.float32, device=device) # Current slice of the device tensor - self.min_p: torch.Tensor = self.min_p_gpu[:0] + self.min_p: torch.Tensor = self.min_p_device[:0] def update_states(self, batch_update: Optional[BatchUpdate] = None): if not batch_update: @@ -96,7 +96,7 @@ def update_states(self, batch_update: Optional[BatchUpdate] = None): size = batch_update.batch_size if self.min_p_count and (needs_update or self.min_p.shape[0] != size): - self.min_p = self.min_p_gpu[:size] + self.min_p = self.min_p_device[:size] self.min_p.copy_(self.min_p_cpu_tensor[:size], non_blocking=True) self.min_p.unsqueeze_(1) diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index e113c3a50c2..0036582d493 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -38,5 +38,8 @@ class SamplingMetadata: # req_index -> bad_words_token_ids bad_words_token_ids: dict[int, list[list[int]]] + # Some logits processors don't affect greedy 
decoding (or if they do, + # only due to precision errors); "non-greedy" processors are + # only applied to random-sampled requests in the batch. logits_procs: list[LogitsProcessor] nongreedy_logits_procs: list[LogitsProcessor] diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 33d43937aa8..db91f199f11 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1065,7 +1065,7 @@ def _get_token_paddings(min_token_size: int, max_token_size: int, If padding_gap == 0 then: increase 2X each time (exponential) else: - first increase the size to twice, + first increase the size to twice, then increase the padding size by padding_gap. """ # assert min_token_size is power of 2 From f1ef8efe02418ebdf048eb650bfbd39154202f53 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 2 May 2025 14:25:48 +0000 Subject: [PATCH 011/180] spec decode min p Signed-off-by: Andrew Feldman --- vllm/v1/sample/logits_processor.py | 3 +++ vllm/v1/spec_decode/utils.py | 2 +- vllm/v1/worker/gpu_input_batch.py | 14 ++++++++++++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/vllm/v1/sample/logits_processor.py b/vllm/v1/sample/logits_processor.py index 7d9342ac1b3..be812c11b76 100644 --- a/vllm/v1/sample/logits_processor.py +++ b/vllm/v1/sample/logits_processor.py @@ -66,6 +66,9 @@ def __init__(self, max_num_reqs: int, pin_memory: bool, # Current slice of the device tensor self.min_p: torch.Tensor = self.min_p_device[:0] + def get_min_p_by_index(self, index: int) -> float: + return float(self.min_p_cpu[index]) + def update_states(self, batch_update: Optional[BatchUpdate] = None): if not batch_update: return diff --git a/vllm/v1/spec_decode/utils.py b/vllm/v1/spec_decode/utils.py index ce81a40ee3a..e9de0086e59 100644 --- a/vllm/v1/spec_decode/utils.py +++ b/vllm/v1/spec_decode/utils.py @@ -3,7 +3,7 @@ def is_spec_decode_supported(req_id: str, input_batch: InputBatch) -> bool: - if req_id in input_batch.min_p_reqs: + if input_batch.get_min_p_by_req_id(req_id): # Spec decode doesn't support min_p sampling. return False elif (req_id in input_batch.frequency_penalties_reqs diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 4f04072b96f..265c09901f0 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -200,16 +200,20 @@ def __init__( # To accumulate prompt logprobs tensor chunks across prefill steps. self.in_progress_prompt_logprobs_cpu: dict[str, LogprobsTensors] = {} + # Define logits processors + # TODO(andy): logits processor list should be extensible via engine + # constructor argument; for now the list is fixed. 
self.logit_procs: list[LogitsProcessor] = [ MinTokensLogitsProcessor(pin_memory=pin_memory, device=device), LogitBiasLogitsProcessor(pin_memory=pin_memory, device=device), ] - self.nongreedy_logits_procs: list[LogitsProcessor] = [ - MinPLogitsProcessor( + self.min_p_logitsproc = MinPLogitsProcessor( pin_memory=pin_memory, device=device, # +1 for temporary swap space max_num_reqs=max_num_reqs + 1) + self.nongreedy_logits_procs: list[LogitsProcessor] = [ + self.min_p_logitsproc ] # TODO convert this to LogitsProcessor @@ -624,6 +628,12 @@ def make_lora_inputs( return prompt_lora_mapping, token_lora_mapping, active_lora_requests + def get_min_p_by_req_id(self, req_id: str) -> float: + assert req_id in self.req_id_to_index + return self.min_p_logitsproc.get_min_p_by_index( + self.req_id_to_index[req_id]) + + @property def num_reqs(self) -> int: return len(self.req_id_to_index) From b270ac443da430002ab1f769f8c267d8e9e3dadf Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 2 May 2025 14:26:35 +0000 Subject: [PATCH 012/180] spec decode min p Signed-off-by: Andrew Feldman --- vllm/v1/sample/logits_processor.py | 4 ++-- vllm/v1/worker/gpu_input_batch.py | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/vllm/v1/sample/logits_processor.py b/vllm/v1/sample/logits_processor.py index be812c11b76..22334e687a8 100644 --- a/vllm/v1/sample/logits_processor.py +++ b/vllm/v1/sample/logits_processor.py @@ -61,8 +61,8 @@ def __init__(self, max_num_reqs: int, pin_memory: bool, self.min_p_cpu = self.min_p_cpu_tensor.numpy() # Pre-allocated device tensor self.min_p_device: torch.Tensor = torch.empty((max_num_reqs, ), - dtype=torch.float32, - device=device) + dtype=torch.float32, + device=device) # Current slice of the device tensor self.min_p: torch.Tensor = self.min_p_device[:0] diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 265c09901f0..c118ad92aec 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -208,10 +208,10 @@ def __init__( LogitBiasLogitsProcessor(pin_memory=pin_memory, device=device), ] self.min_p_logitsproc = MinPLogitsProcessor( - pin_memory=pin_memory, - device=device, - # +1 for temporary swap space - max_num_reqs=max_num_reqs + 1) + pin_memory=pin_memory, + device=device, + # +1 for temporary swap space + max_num_reqs=max_num_reqs + 1) self.nongreedy_logits_procs: list[LogitsProcessor] = [ self.min_p_logitsproc ] @@ -633,7 +633,6 @@ def get_min_p_by_req_id(self, req_id: str) -> float: return self.min_p_logitsproc.get_min_p_by_index( self.req_id_to_index[req_id]) - @property def num_reqs(self) -> int: return len(self.req_id_to_index) From 49531cbffff70742394d1e47e871cec8896951a8 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 5 May 2025 10:14:34 +0000 Subject: [PATCH 013/180] wip TPU fix Signed-off-by: Andrew Feldman --- vllm/v1/sample/tpu/metadata.py | 2 +- vllm/v1/worker/tpu_input_batch.py | 611 +++++++++++++++++++++++++++++ vllm/v1/worker/tpu_model_runner.py | 2 +- 3 files changed, 613 insertions(+), 2 deletions(-) create mode 100644 vllm/v1/worker/tpu_input_batch.py diff --git a/vllm/v1/sample/tpu/metadata.py b/vllm/v1/sample/tpu/metadata.py index 3950fda3e5e..341b38b42a4 100644 --- a/vllm/v1/sample/tpu/metadata.py +++ b/vllm/v1/sample/tpu/metadata.py @@ -4,7 +4,7 @@ import torch -from vllm.v1.worker.gpu_input_batch import InputBatch +from vllm.v1.worker.tpu_input_batch import InputBatch DEFAULT_SAMPLING_PARAMS = dict( temperature=-1.0, diff --git 
a/vllm/v1/worker/tpu_input_batch.py b/vllm/v1/worker/tpu_input_batch.py new file mode 100644 index 00000000000..50dddcd2a30 --- /dev/null +++ b/vllm/v1/worker/tpu_input_batch.py @@ -0,0 +1,611 @@ +# SPDX-License-Identifier: Apache-2.0 +# Datastructures defining an input batch + +from dataclasses import dataclass +from typing import Optional, cast + +import numpy as np +import torch + +from vllm.lora.request import LoRARequest +from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange +from vllm.sampling_params import SamplingParams, SamplingType +from vllm.utils import swap_dict_values +from vllm.v1.outputs import LogprobsTensors +from vllm.v1.sample.tpu.metadata import TPUSupportedSamplingMetadata +from vllm.v1.utils import copy_slice +from vllm.v1.worker.block_table import BlockTable + +_SAMPLING_EPS = 1e-5 + + +@dataclass +class CachedRequestState: + + req_id: str + prompt_token_ids: list[int] + mm_inputs: list[MultiModalKwargs] + mm_positions: list[PlaceholderRange] + sampling_params: SamplingParams + generator: Optional[torch.Generator] + + block_ids: list[int] + num_computed_tokens: int + output_token_ids: list[int] + + mrope_positions: Optional[torch.Tensor] = None + mrope_position_delta: Optional[int] = None + + lora_request: Optional[LoRARequest] = None + + def __post_init__(self): + self.num_prompt_tokens = len(self.prompt_token_ids) + + @property + def num_tokens(self) -> int: + return self.num_prompt_tokens + len(self.output_token_ids) + + def get_token_id(self, idx: int) -> int: + if idx < self.num_prompt_tokens: + return self.prompt_token_ids[idx] + else: + return self.output_token_ids[idx - self.num_prompt_tokens] + + +class InputBatch: + + def __init__( + self, + max_num_reqs: int, + max_model_len: int, + max_num_blocks_per_req: int, + device: torch.device, + pin_memory: bool, + vocab_size: int, + ): + self.max_num_reqs = max_num_reqs + self.max_model_len = max_model_len + self.max_num_blocks_per_req = max_num_blocks_per_req + self.device = device + self.pin_memory = pin_memory + self.vocab_size = vocab_size + + self._req_ids: list[Optional[str]] = [] + self.req_id_to_index: dict[str, int] = {} + + # TODO(woosuk): This buffer could be too large if max_model_len is big. + # Find a way to reduce the CPU memory usage. + # This buffer is not directly transferred to the GPU, so it does not + # need to be pinned. + self.token_ids_cpu_tensor = torch.zeros( + (max_num_reqs, max_model_len), + device="cpu", + dtype=torch.int32, + pin_memory=False, + ) + self.token_ids_cpu = self.token_ids_cpu_tensor.numpy() + self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32) + self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32) + self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32) + self.num_computed_tokens_cpu_tensor = torch.zeros( + (max_num_reqs, ), + device="cpu", + dtype=torch.int32, + pin_memory=pin_memory, + ) + self.num_computed_tokens_cpu = \ + self.num_computed_tokens_cpu_tensor.numpy() + + # Block table. + self.block_table = BlockTable( + max_num_reqs=max_num_reqs, + max_num_blocks_per_req=max_num_blocks_per_req, + pin_memory=pin_memory, + device=device, + ) + + # Sampling-related. 
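For reference, each sampling parameter below follows the same three-part layout: a CPU staging tensor (pinned when possible), a numpy view of it for cheap per-request scalar writes, and a device-side copy consumed by the sampler. A minimal self-contained sketch of that pattern, illustrative only and not part of the patch:

    import torch

    max_num_reqs = 4
    # Pinning requires a CUDA-capable build; fall back to pageable memory otherwise.
    pin = torch.cuda.is_available()
    # Host-side staging buffer; the numpy view allows per-request scalar writes
    # without tensor-indexing overhead.
    temperature_cpu_tensor = torch.empty((max_num_reqs, ),
                                         dtype=torch.float32,
                                         device="cpu",
                                         pin_memory=pin)
    temperature_cpu = temperature_cpu_tensor.numpy()
    temperature_cpu[2] = 0.7  # update a single request's value in place
    # Device-side copy used at sampling time; non_blocking only overlaps the
    # transfer when the source buffer is pinned.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    temperature = torch.empty((max_num_reqs, ), dtype=torch.float32, device=device)
    temperature.copy_(temperature_cpu_tensor, non_blocking=True)
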
+ self.temperature = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device=device) + self.temperature_cpu_tensor = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device="cpu", + pin_memory=pin_memory) + self.temperature_cpu = self.temperature_cpu_tensor.numpy() + self.greedy_reqs: set[str] = set() + self.random_reqs: set[str] = set() + + self.top_p = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device=device) + self.top_p_cpu_tensor = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device="cpu", + pin_memory=pin_memory) + self.top_p_cpu = self.top_p_cpu_tensor.numpy() + self.top_p_reqs: set[str] = set() + + self.top_k = torch.empty((max_num_reqs, ), + dtype=torch.int32, + device=device) + self.top_k_cpu_tensor = torch.empty((max_num_reqs, ), + dtype=torch.int32, + device="cpu", + pin_memory=pin_memory) + self.top_k_cpu = self.top_k_cpu_tensor.numpy() + self.top_k_reqs: set[str] = set() + + self.min_p = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device=device) + self.min_p_cpu_tensor = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device="cpu", + pin_memory=pin_memory) + self.min_p_cpu = self.min_p_cpu_tensor.numpy() + self.min_p_reqs: set[str] = set() + + # Frequency penalty related data structures + self.frequency_penalties = torch.empty((max_num_reqs, ), + dtype=torch.float, + device=device) + self.frequency_penalties_cpu_tensor = torch.empty( + (max_num_reqs, ), + dtype=torch.float, + device="cpu", + pin_memory=pin_memory) + self.frequency_penalties_cpu = \ + self.frequency_penalties_cpu_tensor.numpy() + self.frequency_penalties_reqs: set[str] = set() + + # Presence penalty related data structures + self.presence_penalties = torch.empty((max_num_reqs, ), + dtype=torch.float, + device=device) + self.presence_penalties_cpu_tensor = torch.empty((max_num_reqs, ), + dtype=torch.float, + device="cpu", + pin_memory=pin_memory) + self.presence_penalties_cpu = self.presence_penalties_cpu_tensor.numpy( + ) + self.presence_penalties_reqs: set[str] = set() + + # Repetition penalty related data structures + self.repetition_penalties = torch.empty((max_num_reqs, ), + dtype=torch.float, + device=device) + self.repetition_penalties_cpu_tensor = torch.empty( + (max_num_reqs, ), + dtype=torch.float, + device="cpu", + pin_memory=pin_memory) + self.repetition_penalties_cpu = \ + self.repetition_penalties_cpu_tensor.numpy() + self.repetition_penalties_reqs: set[str] = set() + + # req_index -> (min_tokens, stop_token_ids) + self.min_tokens: dict[int, tuple[int, set[int]]] = {} + + # lora related + self.request_lora_mapping = np.zeros((self.max_num_reqs, ), + dtype=np.int32) + self.lora_id_to_request_ids: dict[int, set[str]] = {} + self.lora_id_to_lora_request: dict[int, LoRARequest] = {} + + # req_index -> generator + # NOTE(woosuk): The indices of the requests that do not have their own + # generator should not be included in the dictionary. + self.generators: dict[int, torch.Generator] = {} + + self.num_logprobs: dict[str, int] = {} + # NOTE(rob): num_prompt_logprobs only includes reqs + # that are currently in the prefill phase. + self.num_prompt_logprobs: dict[str, int] = {} + + # To accumulate prompt logprobs tensor chunks across prefill steps. + self.in_progress_prompt_logprobs_cpu: dict[str, LogprobsTensors] = {} + + self.logit_bias: list[Optional[dict[int, + float]]] = [None] * max_num_reqs + self.has_allowed_token_ids: set[str] = set() + # NOTE(lufang): In the mask tensor, if the corresponding token allowed, + # the value is False. 
Since we use masked_fill_ to set -inf. + self.allowed_token_ids_mask: Optional[torch.Tensor] = None + self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None + + # req_index -> bad_words_token_ids + self.bad_words_token_ids: dict[int, list[list[int]]] = {} + + self.req_output_token_ids: list[Optional[list[int]]] = [] + + @property + def req_ids(self) -> list[str]: + # None elements should only be present transiently + # while performing state updates to the batch. + return cast(list[str], self._req_ids) + + def add_request( + self, + request: "CachedRequestState", + req_index: Optional[int] = None, + ) -> None: + if req_index is None: + req_index = self.num_reqs + assert req_index < self.max_num_reqs + + req_id = request.req_id + if req_index == len(self._req_ids): + self._req_ids.append(req_id) + self.req_output_token_ids.append(request.output_token_ids) + else: + self._req_ids[req_index] = req_id + self.req_output_token_ids[req_index] = request.output_token_ids + + self.req_id_to_index[req_id] = req_index + + # Copy the prompt token ids and output token ids. + num_prompt_tokens = len(request.prompt_token_ids) + self.num_prompt_tokens[req_index] = num_prompt_tokens + self.token_ids_cpu[ + req_index, :num_prompt_tokens] = request.prompt_token_ids + start_idx = num_prompt_tokens + end_idx = start_idx + len(request.output_token_ids) + self.token_ids_cpu[req_index, + start_idx:end_idx] = request.output_token_ids + # Number of token ids in token_ids_cpu. + # NOTE(woosuk): This may include spec decode tokens. + self.num_tokens[req_index] = request.num_tokens + # Number of tokens without spec decode tokens. + self.num_tokens_no_spec[req_index] = request.num_tokens + + self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens + self.block_table.add_row(request.block_ids, req_index) + + sampling_params = request.sampling_params + if sampling_params.sampling_type == SamplingType.GREEDY: + # Avoid later division by zero. + self.temperature_cpu[req_index] = -1.0 + self.greedy_reqs.add(req_id) + else: + self.temperature_cpu[req_index] = sampling_params.temperature + self.random_reqs.add(req_id) + + self.top_p_cpu[req_index] = sampling_params.top_p + if sampling_params.top_p < 1: + self.top_p_reqs.add(req_id) + top_k = sampling_params.top_k + if 0 < top_k < self.vocab_size: + self.top_k_reqs.add(req_id) + else: + top_k = self.vocab_size + self.top_k_cpu[req_index] = top_k + self.min_p_cpu[req_index] = sampling_params.min_p + self.frequency_penalties_cpu[ + req_index] = sampling_params.frequency_penalty + if sampling_params.min_p > _SAMPLING_EPS: + self.min_p_reqs.add(req_id) + if sampling_params.frequency_penalty != 0.0: + self.frequency_penalties_reqs.add(req_id) + self.presence_penalties_cpu[ + req_index] = sampling_params.presence_penalty + if sampling_params.presence_penalty != 0.0: + self.presence_penalties_reqs.add(req_id) + self.repetition_penalties_cpu[ + req_index] = sampling_params.repetition_penalty + if sampling_params.repetition_penalty != 1.0: + self.repetition_penalties_reqs.add(req_id) + if sampling_params.min_tokens: + self.min_tokens[req_index] = (sampling_params.min_tokens, + sampling_params.all_stop_token_ids) + + # NOTE(woosuk): self.generators should not include the requests that + # do not have their own generator. 
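For context, only seeded requests carry their own generator; unseeded requests sample from the global RNG and are deliberately absent from `self.generators`. How the generator is constructed is not shown in this file, so the helper below is a hypothetical sketch assuming the usual torch.Generator plus SamplingParams.seed convention:

    from typing import Optional

    import torch
    from vllm import SamplingParams

    def make_request_generator(params: SamplingParams,
                               device: str) -> Optional[torch.Generator]:
        # Unseeded requests return None and therefore never get an entry
        # in self.generators.
        if params.seed is None:
            return None
        return torch.Generator(device=device).manual_seed(params.seed)
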
+ if request.generator is not None: + self.generators[req_index] = request.generator + + if sampling_params.logprobs is not None: + self.num_logprobs[req_id] = sampling_params.logprobs + if sampling_params.prompt_logprobs is not None: + self.num_prompt_logprobs[req_id] = sampling_params.prompt_logprobs + if sampling_params.logit_bias is not None: + self.logit_bias[req_index] = sampling_params.logit_bias + + if sampling_params.allowed_token_ids: + self.has_allowed_token_ids.add(req_id) + if self.allowed_token_ids_mask_cpu_tensor is None: + # Lazy allocation for this tensor, which can be large. + # False means we don't fill with -inf. + self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs, + self.vocab_size, + dtype=torch.bool, + device=self.device) + self.allowed_token_ids_mask_cpu_tensor = torch.zeros( + self.max_num_reqs, + self.vocab_size, + dtype=torch.bool, + device="cpu") + self.allowed_token_ids_mask_cpu_tensor[req_index] = True + # False means we don't fill with -inf. + self.allowed_token_ids_mask_cpu_tensor[req_index][ + sampling_params.allowed_token_ids] = False + + if sampling_params.bad_words_token_ids: + self.bad_words_token_ids[ + req_index] = sampling_params.bad_words_token_ids + + # Add request lora ID + if request.lora_request: + lora_id = request.lora_request.lora_int_id + if lora_id not in self.lora_id_to_request_ids: + self.lora_id_to_request_ids[lora_id] = set() + + self.request_lora_mapping[req_index] = lora_id + self.lora_id_to_request_ids[lora_id].add(request.req_id) + self.lora_id_to_lora_request[lora_id] = request.lora_request + else: + # No LoRA + self.request_lora_mapping[req_index] = 0 + + def remove_request(self, req_id: str) -> Optional[int]: + """This method must always be followed by a call to condense().""" + + req_index = self.req_id_to_index.pop(req_id, None) + if req_index is None: + return None + self._req_ids[req_index] = None + self.req_output_token_ids[req_index] = None + + self.greedy_reqs.discard(req_id) + self.random_reqs.discard(req_id) + self.top_p_reqs.discard(req_id) + self.top_k_reqs.discard(req_id) + self.min_p_reqs.discard(req_id) + self.min_tokens.pop(req_index, None) + self.frequency_penalties_reqs.discard(req_id) + self.presence_penalties_reqs.discard(req_id) + self.repetition_penalties_reqs.discard(req_id) + self.generators.pop(req_index, None) + self.num_logprobs.pop(req_id, None) + self.num_prompt_logprobs.pop(req_id, None) + self.in_progress_prompt_logprobs_cpu.pop(req_id, None) + + # LoRA + lora_id = self.request_lora_mapping[req_index] + if lora_id != 0: + self.lora_id_to_request_ids[lora_id].discard(req_id) + if len(self.lora_id_to_request_ids[lora_id]) == 0: + self.lora_id_to_request_ids.pop(lora_id) + self.lora_id_to_lora_request.pop(lora_id) + self.request_lora_mapping[req_index] = 0 + + self.logit_bias[req_index] = None + self.has_allowed_token_ids.discard(req_id) + if self.allowed_token_ids_mask_cpu_tensor is not None: + # False means we don't fill with -inf. 
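To make the mask convention concrete, here is a small self-contained illustration (not part of the patch) of how the True/False layout interacts with masked_fill_ when applied to logits: True marks tokens to disallow, so allowed tokens stay False and an all-False row leaves the logits untouched.

    import torch

    vocab_size = 8
    logits = torch.zeros(2, vocab_size)
    mask = torch.ones(2, vocab_size, dtype=torch.bool)
    mask[0, [1, 3]] = False   # request 0 only allows tokens 1 and 3
    mask[1].fill_(False)      # request 1 has no allowed-token restriction
    # Disallowed positions (True) are pushed to -inf before sampling.
    logits.masked_fill_(mask, float("-inf"))
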
+ self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False) + self.bad_words_token_ids.pop(req_index, None) + return req_index + + def swap_states(self, i1: int, i2: int) -> None: + old_id_i1 = self._req_ids[i1] + old_id_i2 = self._req_ids[i2] + self._req_ids[i1], self._req_ids[i2] =\ + self._req_ids[i2], self._req_ids[i1] # noqa + self.req_output_token_ids[i1], self.req_output_token_ids[i2] =\ + self.req_output_token_ids[i2], self.req_output_token_ids[i1] + assert old_id_i1 is not None and old_id_i2 is not None + self.req_id_to_index[old_id_i1], self.req_id_to_index[old_id_i2] =\ + self.req_id_to_index[old_id_i2], self.req_id_to_index[old_id_i1] + self.num_tokens[i1], self.num_tokens[i2] =\ + self.num_tokens[i2], self.num_tokens[i1] + self.num_tokens_no_spec[i1], self.num_tokens_no_spec[i2] =\ + self.num_tokens_no_spec[i2], self.num_tokens_no_spec[i1] + self.num_prompt_tokens[i1], self.num_prompt_tokens[i2] =\ + self.num_prompt_tokens[i2], self.num_prompt_tokens[i1] + self.num_computed_tokens_cpu[i1], self.num_computed_tokens_cpu[i2] =\ + self.num_computed_tokens_cpu[i2], self.num_computed_tokens_cpu[i1] + self.temperature_cpu[i1], self.temperature_cpu[i2] =\ + self.temperature_cpu[i2], self.temperature_cpu[i1] + self.top_p_cpu[i1], self.top_p_cpu[i2] =\ + self.top_p_cpu[i2], self.top_p_cpu[i1] + self.top_k_cpu[i1], self.top_k_cpu[i2] =\ + self.top_k_cpu[i2], self.top_k_cpu[i1] + self.frequency_penalties_cpu[i1], self.frequency_penalties_cpu[i2] =\ + self.frequency_penalties_cpu[i2], self.frequency_penalties_cpu[i1] + self.presence_penalties_cpu[i1], self.presence_penalties_cpu[i2] =\ + self.presence_penalties_cpu[i2], self.presence_penalties_cpu[i1] + self.repetition_penalties_cpu[i1], self.repetition_penalties_cpu[i2] =\ + self.repetition_penalties_cpu[i2], self.repetition_penalties_cpu[i1] + self.min_p_cpu[i1], self.min_p_cpu[i2] =\ + self.min_p_cpu[i2], self.min_p_cpu[i1] + + # NOTE: the following is unsafe + # self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\ + # self.token_ids_cpu[i2, ...], self.token_ids_cpu[i1, ...] + # instead, we need to temporiarily copy the data for one of the indices + # TODO(lucas): optimize this by only copying valid indices + tmp = self.token_ids_cpu[i1, ...].copy() + self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...] + self.token_ids_cpu[i2, ...] = tmp + + swap_dict_values(self.generators, i1, i2) + swap_dict_values(self.min_tokens, i1, i2) + swap_dict_values(self.bad_words_token_ids, i1, i2) + + self.request_lora_mapping[i1], self.request_lora_mapping[i2] =\ + self.request_lora_mapping[i2], self.request_lora_mapping[i1] + self.logit_bias[i1], self.logit_bias[i2] =\ + self.logit_bias[i2], self.logit_bias[i1] + + if self.allowed_token_ids_mask_cpu_tensor is not None: + self.allowed_token_ids_mask_cpu_tensor[i1], \ + self.allowed_token_ids_mask_cpu_tensor[i2] =\ + self.allowed_token_ids_mask_cpu_tensor[i2], \ + self.allowed_token_ids_mask_cpu_tensor[i1] + self.block_table.swap_row(i1, i2) + + def condense(self, empty_req_indices: list[int]) -> None: + num_reqs = self.num_reqs + if num_reqs == 0: + # The batched states are empty. + self._req_ids.clear() + self.req_output_token_ids.clear() + return + + # NOTE(woosuk): This function assumes that the empty_req_indices + # is sorted in descending order. + last_req_index = num_reqs + len(empty_req_indices) - 1 + while empty_req_indices: + # Find the largest non-empty index. + while last_req_index in empty_req_indices: + last_req_index -= 1 + + # Find the smallest empty index. 
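As a standalone illustration of the compaction walk below (not part of the patch), the same index logic on a toy batch: it assumes three live requests with empty slots 4 and 1, matching the descending-order precondition noted above.

    # Three live requests occupy slots {0, 2, 3}; slots 4 and 1 are holes.
    num_reqs = 3
    empty_req_indices = [4, 1]   # must be sorted in descending order
    last_req_index = num_reqs + len(empty_req_indices) - 1   # -> 4
    moves = []
    while empty_req_indices:
        # Skip trailing holes to find the largest occupied index.
        while last_req_index in empty_req_indices:
            last_req_index -= 1
        # Smallest empty index is at the end of the descending list.
        empty_index = empty_req_indices.pop()
        if empty_index >= last_req_index:
            break
        moves.append((last_req_index, empty_index))
        last_req_index -= 1
    # moves == [(3, 1)]: slot 3's request moves into slot 1, leaving 0..2 dense.
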
+ empty_index = empty_req_indices.pop() + if empty_index >= last_req_index: + break + + # Swap the states. + req_id = self._req_ids[last_req_index] + output_token_ids = self.req_output_token_ids[last_req_index] + assert req_id is not None + self._req_ids[empty_index] = req_id + self._req_ids[last_req_index] = None + self.req_output_token_ids[empty_index] = output_token_ids + self.req_output_token_ids[last_req_index] = None + self.req_id_to_index[req_id] = empty_index + + num_tokens = self.num_tokens[last_req_index] + self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[ + last_req_index, :num_tokens] + self.num_tokens[empty_index] = num_tokens + self.num_tokens_no_spec[empty_index] = self.num_tokens_no_spec[ + last_req_index] + self.num_prompt_tokens[empty_index] = self.num_prompt_tokens[ + last_req_index] + self.num_computed_tokens_cpu[ + empty_index] = self.num_computed_tokens_cpu[last_req_index] + self.block_table.move_row(last_req_index, empty_index) + self.temperature_cpu[empty_index] = self.temperature_cpu[ + last_req_index] + self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index] + self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index] + self.frequency_penalties_cpu[ + empty_index] = self.frequency_penalties_cpu[last_req_index] + self.presence_penalties_cpu[ + empty_index] = self.presence_penalties_cpu[last_req_index] + self.repetition_penalties_cpu[ + empty_index] = self.repetition_penalties_cpu[last_req_index] + self.min_p_cpu[empty_index] = self.min_p_cpu[last_req_index] + generator = self.generators.pop(last_req_index, None) + if generator is not None: + self.generators[empty_index] = generator + + min_token = self.min_tokens.pop(last_req_index, None) + if min_token is not None: + self.min_tokens[empty_index] = min_token + + self.request_lora_mapping[empty_index] = self.request_lora_mapping[ + last_req_index] + + self.logit_bias[empty_index] = self.logit_bias[last_req_index] + + if self.allowed_token_ids_mask_cpu_tensor is not None: + self.allowed_token_ids_mask_cpu_tensor[ + empty_index] = self.allowed_token_ids_mask_cpu_tensor[ + last_req_index] + + bad_words_token_ids = self.bad_words_token_ids.pop( + last_req_index, None) + if bad_words_token_ids is not None: + self.bad_words_token_ids[empty_index] = bad_words_token_ids + # Decrement last_req_index since it is now empty. + last_req_index -= 1 + + # Trim lists to the batch size. + del self._req_ids[self.num_reqs:] + del self.req_output_token_ids[self.num_reqs:] + + def _make_prompt_token_ids_tensor(self) -> torch.Tensor: + max_prompt_len = self.num_prompt_tokens[:self.num_reqs].max() + prompt_token_ids_cpu_tensor = torch.empty( + (self.num_reqs, max_prompt_len), + device="cpu", + dtype=torch.int64, + pin_memory=self.pin_memory, + ) + prompt_token_ids = prompt_token_ids_cpu_tensor.numpy() + prompt_token_ids[:] = self.token_ids_cpu[:self. + num_reqs, :max_prompt_len] + # Use the value of vocab_size as a pad since we don't have a + # token_id of this value. + for i in range(self.num_reqs): + prompt_token_ids[i, self.num_prompt_tokens[i]:] = self.vocab_size + return prompt_token_ids_cpu_tensor.to(device=self.device, + non_blocking=True) + + def make_lora_inputs( + self, num_scheduled_tokens: np.ndarray + ) -> tuple[tuple[int, ...], tuple[int, ...], set[LoRARequest]]: + """ + Given the num_scheduled_tokens for each request in the batch, return + datastructures used to activate the current LoRAs. + Returns: + 1. 
prompt_lora_mapping: A tuple of size self.num_reqs where, + prompt_lora_mapping[i] is the LoRA id to use for the ith prompt. + 2. token_lora_mapping: A tuple of size np.sum(num_scheduled_tokens) + where, token_lora_mapping[i] is the LoRA id to use for ith token. + 3. lora_requests: Set of relevant LoRA requests. + """ + + req_lora_mapping = self.request_lora_mapping[:self.num_reqs] + prompt_lora_mapping = tuple(req_lora_mapping) + token_lora_mapping = tuple( + req_lora_mapping.repeat(num_scheduled_tokens)) + active_lora_requests: set[LoRARequest] = set( + self.lora_id_to_lora_request.values()) + + return prompt_lora_mapping, token_lora_mapping, active_lora_requests + + @property + def num_reqs(self) -> int: + return len(self.req_id_to_index) + + @property + def all_greedy(self) -> bool: + return len(self.random_reqs) == 0 + + @property + def all_random(self) -> bool: + return len(self.greedy_reqs) == 0 + + @property + def no_top_p(self) -> bool: + return len(self.top_p_reqs) == 0 + + @property + def no_top_k(self) -> bool: + return len(self.top_k_reqs) == 0 + + @property + def no_min_p(self) -> bool: + return len(self.min_p_reqs) == 0 + + @property + def no_penalties(self) -> bool: + return (len(self.presence_penalties_reqs) == 0 + and len(self.frequency_penalties_reqs) == 0 + and len(self.repetition_penalties_reqs) == 0) + + @property + def max_num_logprobs(self) -> Optional[int]: + return max(self.num_logprobs.values()) if self.num_logprobs else None + + @property + def no_prompt_logprob(self) -> bool: + return not self.num_prompt_logprobs + + @property + def no_allowed_token_ids(self) -> bool: + return len(self.has_allowed_token_ids) == 0 diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index db91f199f11..6b5c0662b80 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -35,7 +35,7 @@ from vllm.v1.sample.tpu.metadata import TPUSupportedSamplingMetadata from vllm.v1.sample.tpu.sampler import Sampler as TPUSampler from vllm.v1.utils import bind_kv_cache -from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch +from vllm.v1.worker.tpu_input_batch import CachedRequestState, InputBatch from .utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) From 066761d931d41aa7318cbfb9a285eb170cf3f752 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 5 May 2025 11:00:28 +0000 Subject: [PATCH 014/180] merge Signed-off-by: Andrew Feldman --- .../configs/DeepSeek-V2-Lite-Chat.yaml | 1 + ...lama-3-70B-Instruct-FBGEMM-nonuniform.yaml | 1 + .../configs/Meta-Llama-3-70B-Instruct.yaml | 1 + ...struct-Channelwise-compressed-tensors.yaml | 1 + ...Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml | 1 + ...-3-8B-Instruct-FP8-compressed-tensors.yaml | 1 + .../configs/Meta-Llama-3-8B-Instruct-FP8.yaml | 1 + ...Instruct-INT8-compressed-tensors-asym.yaml | 1 + ...3-8B-Instruct-INT8-compressed-tensors.yaml | 1 + ...nstruct-nonuniform-compressed-tensors.yaml | 1 + .../configs/Meta-Llama-3-8B-Instruct.yaml | 3 +- .../configs/Meta-Llama-3-8B-QQQ.yaml | 1 + ...2-1B-Instruct-INT8-compressed-tensors.yaml | 1 + .../configs/Minitron-4B-Base-FP8.yaml | 1 + ...xtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml | 1 + .../Mixtral-8x7B-Instruct-v0.1-FP8.yaml | 1 + .../configs/Mixtral-8x7B-Instruct-v0.1.yaml | 3 +- .../Qwen1.5-MoE-W4A16-compressed-tensors.yaml | 5 +- .../configs/Qwen2-1.5B-Instruct-FP8W8.yaml | 1 + ...1.5B-Instruct-INT8-compressed-tensors.yaml | 1 + 
....5B-Instruct-W8A16-compressed-tensors.yaml | 1 + .../configs/Qwen2-57B-A14-Instruct.yaml | 1 + .../SparseLlama3.1_2of4_fp8_compressed.yaml | 1 + .../test_lm_eval_correctness.py | 2 +- .buildkite/release-pipeline.yaml | 27 +- .../scripts/hardware_ci/run-amd-test.sh | 67 +- .../hardware_ci/run-cpu-test-ppc64le.sh | 35 +- .../scripts/hardware_ci/run-tpu-v1-test.sh | 11 +- .buildkite/scripts/upload-wheels.sh | 18 +- .buildkite/test-pipeline.yaml | 160 +- .github/CODEOWNERS | 1 + .github/ISSUE_TEMPLATE/200-installation.yml | 2 +- .github/ISSUE_TEMPLATE/300-usage.yml | 2 +- .github/ISSUE_TEMPLATE/400-bug-report.yml | 6 +- .../700-performance-discussion.yml | 2 +- .github/mergify.yml | 34 +- .github/workflows/lint-and-deploy.yaml | 4 +- .gitignore | 5 +- .pre-commit-config.yaml | 15 +- CMakeLists.txt | 51 +- benchmarks/auto_tune.sh | 212 ++ benchmarks/backend_request_func.py | 109 + benchmarks/benchmark_dataset.py | 134 ++ benchmarks/benchmark_prefix_caching.py | 14 +- benchmarks/benchmark_serving.py | 56 +- .../benchmark_serving_structured_output.py | 23 +- benchmarks/benchmark_throughput.py | 7 + benchmarks/kernels/benchmark_bitblas.py | 236 +++ .../kernels/benchmark_grouped_gemm_cutlass.py | 3 +- benchmarks/kernels/benchmark_lora.py | 10 +- benchmarks/kernels/benchmark_moe.py | 36 +- .../benchmark_moe_permute_unpermute.py | 349 ++++ cmake/external_projects/vllm_flash_attn.cmake | 2 +- csrc/attention/merge_attn_states.cu | 25 +- csrc/attention/mla/cutlass_mla_entry.cu | 38 + csrc/attention/mla/cutlass_mla_kernels.cu | 225 +++ csrc/cache_kernels.cu | 39 +- csrc/core/math.hpp | 19 + csrc/moe/marlin_kernels/marlin_moe_kernel.h | 8 +- csrc/moe/marlin_moe_wna16/marlin_template.h | 8 +- csrc/moe/moe_permute_unpermute_op.cu | 133 ++ csrc/moe/moe_wna16.cu | 10 +- csrc/moe/moe_wna16_utils.h | 16 +- csrc/moe/permute_unpermute_kernels/dispatch.h | 53 + .../moe_permute_unpermute_kernel.cu | 229 +++ .../moe_permute_unpermute_kernel.h | 95 + .../moe_permute_unpermute_kernel.inl | 211 ++ csrc/moe/torch_bindings.cpp | 22 + csrc/ops.h | 9 + csrc/quantization/activation_kernels.cu | 120 ++ .../quantization/cutlass_w8a8/moe/moe_data.cu | 17 +- .../scaled_mm_c2x_sm89_fp8_dispatch.cuh | 2 +- .../scaled_mm_c2x_sm89_int8_dispatch.cuh | 2 +- .../fp4/nvfp4_scaled_mm_kernels.cu | 2 +- ...fused_layernorm_dynamic_per_token_quant.cu | 2 +- .../gptq_allspark/allspark_qgemm_w8a16.cu | 2 +- csrc/quantization/gptq_marlin/gptq_marlin.cu | 16 +- .../marlin/dense/marlin_cuda_kernel.cu | 4 +- .../marlin/qqq/marlin_qqq_gemm_kernel.cu | 4 +- csrc/quantization/marlin/sparse/common/mma.h | 4 +- csrc/rocm/attention.cu | 11 +- csrc/rocm/ops.h | 9 + csrc/rocm/skinny_gemms.cu | 1600 +++++++++++++++ csrc/rocm/torch_bindings.cpp | 18 + csrc/torch_bindings.cpp | 13 +- docker/Dockerfile | 55 +- docker/Dockerfile.cpu | 1 + docker/Dockerfile.nightly_torch | 313 +++ docker/Dockerfile.rocm | 10 +- docker/Dockerfile.rocm_base | 7 +- docker/Dockerfile.s390x | 3 +- docker/Dockerfile.tpu | 2 +- docker/Dockerfile.xpu | 6 - docs/Makefile | 1 + docs/source/api/engine/async_llm_engine.md | 7 - docs/source/api/engine/index.md | 17 - docs/source/api/engine/llm_engine.md | 7 - docs/source/api/inference_params.md | 21 - docs/source/api/model/adapters.md | 9 - docs/source/api/model/index.md | 11 - docs/source/api/model/interfaces.md | 9 - docs/source/api/model/interfaces_base.md | 9 - docs/source/api/multimodal/index.md | 28 - docs/source/api/multimodal/inputs.md | 49 - docs/source/api/multimodal/parse.md | 9 - docs/source/api/multimodal/processing.md | 9 
- docs/source/api/multimodal/profiling.md | 9 - docs/source/api/multimodal/registry.md | 9 - docs/source/api/offline_inference/index.md | 9 - docs/source/api/offline_inference/llm.md | 7 - .../api/offline_inference/llm_inputs.md | 19 - docs/source/api/summary.md | 133 ++ .../deployment/anything-llm-chat-with-doc.png | Bin 0 -> 120834 bytes .../anything-llm-chat-without-doc.png | Bin 0 -> 138979 bytes .../deployment/anything-llm-provider.png | Bin 0 -> 112470 bytes .../deployment/anything-llm-upload-doc.png | Bin 0 -> 114117 bytes docs/source/assets/deployment/open_webui.png | Bin 0 -> 69283 bytes .../assets/deployment/streamlit-chat.png | Bin 0 -> 108553 bytes docs/source/autodoc2_docstring_parser.py | 21 + docs/source/conf.py | 131 +- .../source/contributing/deprecation_policy.md | 87 + docs/source/contributing/model/multimodal.md | 62 +- docs/source/contributing/overview.md | 12 +- docs/source/deployment/docker.md | 12 + .../deployment/frameworks/anything-llm.md | 47 + docs/source/deployment/frameworks/index.md | 3 + .../deployment/frameworks/open-webui.md | 29 + .../source/deployment/frameworks/streamlit.md | 42 + .../integrations/production-stack.md | 2 +- docs/source/deployment/security.md | 58 + docs/source/design/arch_overview.md | 4 +- docs/source/design/mm_processing.md | 2 +- docs/source/design/v1/metrics.md | 18 +- docs/source/design/v1/prefix_caching.md | 20 +- docs/source/design/v1/torch_compile.md | 4 +- docs/source/features/compatibility_matrix.md | 6 +- docs/source/features/disagg_prefill.md | 4 +- docs/source/features/lora.md | 61 +- docs/source/features/quantization/auto_awq.md | 4 +- docs/source/features/quantization/bitblas.md | 48 + docs/source/features/quantization/bnb.md | 2 +- docs/source/features/quantization/fp8.md | 19 +- .../source/features/quantization/gptqmodel.md | 21 +- docs/source/features/quantization/index.md | 2 + docs/source/features/quantization/int4.md | 8 +- docs/source/features/quantization/int8.md | 8 +- docs/source/features/quantization/modelopt.md | 78 + .../quantization/quantized_kvcache.md | 2 +- docs/source/features/quantization/quark.md | 7 + .../quantization/supported_hardware.md | 23 +- docs/source/features/quantization/torchao.md | 3 +- docs/source/features/reasoning_outputs.md | 14 +- docs/source/features/structured_outputs.md | 55 +- docs/source/features/tool_calling.md | 28 +- .../ai_accelerator/hpu-gaudi.inc.md | 34 +- .../installation/ai_accelerator/tpu.inc.md | 10 +- .../getting_started/installation/cpu.md | 2 +- .../installation/cpu/build.inc.md | 8 +- .../installation/gpu/cuda.inc.md | 14 +- .../installation/gpu/rocm.inc.md | 17 +- .../installation/gpu/xpu.inc.md | 11 +- .../source/getting_started/troubleshooting.md | 2 +- docs/source/getting_started/v1_user_guide.md | 7 +- docs/source/index.md | 9 +- .../models/extensions/fastsafetensor.md | 2 +- .../models/extensions/runai_model_streamer.md | 26 + docs/source/models/generative_models.md | 4 +- docs/source/models/pooling_models.md | 76 +- docs/source/models/supported_models.md | 154 +- docs/source/performance/optimization.md | 187 +- docs/source/serving/distributed_serving.md | 4 + docs/source/serving/engine_args.md | 2 + docs/source/serving/multimodal_inputs.md | 2 +- docs/source/serving/offline_inference.md | 85 +- .../serving/openai_compatible_server.md | 27 +- examples/lmcache/README.md | 56 + examples/lmcache/cpu_offload_lmcache.py | 151 ++ .../disagg_prefill_lmcache_v0.py} | 0 .../configs/lmcache-decoder-config.yaml | 13 + .../configs/lmcache-prefiller-config.yaml | 13 + 
.../disagg_example_nixl.sh | 136 ++ .../disagg_proxy_server.py | 193 ++ .../disagg_vllm_launcher.sh | 59 + .../lmcache/kv_cache_sharing_lmcache_v1.py | 130 ++ examples/offline_inference/audio_language.py | 65 +- .../offline_inference/batch_llm_inference.py | 90 + .../offline_inference/cpu_offload_lmcache.py | 65 - .../decode_example.py | 36 + .../prefill_example.py | 43 + .../disaggregated-prefill-v1/run.sh | 5 + examples/offline_inference/distributed.py | 109 - examples/offline_inference/eagle.py | 24 +- .../encoder_decoder_multimodal.py | 3 +- .../offline_inference/llm_engine_example.py | 12 +- examples/offline_inference/mistral-small.py | 5 +- examples/offline_inference/profiling.py | 2 +- .../offline_inference/qwen2_5_omni/README.md | 32 + .../qwen2_5_omni/only_thinker.py | 159 ++ examples/offline_inference/vision_language.py | 79 +- .../vision_language_multi_image.py | 40 +- .../online_serving/chart-helm/values.yaml | 2 +- .../gradio_openai_chatbot_webserver.py | 4 - examples/online_serving/kv_events.sh | 86 + .../online_serving/kv_events_subscriber.py | 114 ++ ...i_chat_completion_client_for_multimodal.py | 20 +- ...penai_chat_completion_client_with_tools.py | 195 +- ...t_completion_client_with_tools_required.py | 58 +- ...enai_chat_completion_structured_outputs.py | 195 +- ...etion_structured_outputs_structural_tag.py | 85 + ...etion_structured_outputs_with_reasoning.py | 160 +- ...at_completion_tool_calls_with_reasoning.py | 160 +- .../openai_chat_completion_with_reasoning.py | 65 +- ...hat_completion_with_reasoning_streaming.py | 84 +- ...ai_chat_embedding_client_for_multimodal.py | 11 +- .../openai_completion_client.py | 58 +- .../openai_cross_encoder_score.py | 23 +- .../online_serving/openai_embedding_client.py | 45 +- .../openai_embedding_matryoshka_fy.py | 36 + .../online_serving/openai_pooling_client.py | 15 +- .../openai_transcription_client.py | 15 +- examples/online_serving/ray_serve_deepseek.py | 48 + .../streamlit_openai_chatbot_webserver.py | 185 ++ examples/tool_chat_template_llama4_json.jinja | 116 ++ examples/tool_chat_template_mistral3.jinja | 119 ++ pyproject.toml | 17 +- requirements/build.txt | 6 +- requirements/common.txt | 6 +- requirements/cpu.txt | 11 +- requirements/cuda.txt | 9 +- requirements/docs.txt | 24 +- requirements/hpu.txt | 4 +- requirements/neuron.txt | 2 + requirements/nightly_torch_test.txt | 33 + requirements/rocm-build.txt | 11 +- requirements/rocm.txt | 3 +- requirements/test.in | 12 +- requirements/test.txt | 168 +- requirements/tpu.txt | 3 +- requirements/xpu.txt | 10 +- setup.py | 29 +- tests/compile/test_basic_correctness.py | 3 +- tests/compile/test_full_graph.py | 6 +- tests/compile/test_functionalization.py | 33 +- tests/compile/test_fusion.py | 9 +- tests/compile/test_pass_manager.py | 9 +- tests/compile/test_sequence_parallelism.py | 190 ++ tests/compile/test_silu_mul_quant_fusion.py | 74 + tests/conftest.py | 122 +- tests/core/block/e2e/test_correctness.py | 6 +- tests/core/test_scheduler.py | 74 +- tests/core/utils.py | 11 +- tests/distributed/conftest.py | 145 ++ tests/distributed/test_comm_ops.py | 31 +- tests/distributed/test_events.py | 193 ++ tests/distributed/test_pipeline_parallel.py | 6 +- tests/distributed/test_sequence_parallel.py | 296 +++ tests/engine/test_arg_utils.py | 172 +- tests/engine/test_options.py | 60 + tests/engine/test_skip_tokenizer_init.py | 29 - tests/entrypoints/llm/test_chat.py | 127 +- tests/entrypoints/llm/test_guided_generate.py | 304 ++- .../test_transcription_api_correctness.py | 1 + 
tests/entrypoints/openai/test_audio.py | 33 +- .../openai/test_chat_with_tool_reasoning.py | 6 +- tests/entrypoints/openai/test_cli_args.py | 14 +- tests/entrypoints/openai/test_embedding.py | 42 +- .../openai/test_embedding_dimensions.py | 140 +- .../entrypoints/openai/test_lora_resolvers.py | 209 ++ .../entrypoints/openai/test_openai_schema.py | 49 + tests/entrypoints/openai/test_serving_chat.py | 40 + .../openai/test_transcription_validation.py | 33 + tests/entrypoints/openai/test_truncation.py | 103 + tests/entrypoints/openai/test_video.py | 33 +- tests/entrypoints/openai/test_vision.py | 34 +- .../openai/test_vision_embedding.py | 4 +- tests/kernels/{ => attention}/conftest.py | 0 .../kernels/{ => attention}/test_attention.py | 3 +- .../attention/test_attention_selector.py | 252 +++ .../test_blocksparse_attention.py | 3 +- tests/kernels/{ => attention}/test_cache.py | 60 +- .../test_cascade_flash_attn.py | 0 .../test_encoder_decoder_attn.py | 0 .../{ => attention}/test_flash_attn.py | 2 +- .../{ => attention}/test_flashinfer.py | 0 .../kernels/{ => attention}/test_flashmla.py | 0 .../{ => attention}/test_lightning_attn.py | 0 .../{ => attention}/test_merge_attn_states.py | 0 .../kernels/{ => attention}/test_mha_attn.py | 0 .../{ => attention}/test_mla_decode_cpu.py | 0 .../{ => attention}/test_prefix_prefill.py | 0 .../attention/test_rocm_attention_selector.py | 61 + .../test_triton_decode_attention.py | 0 tests/kernels/{ => core}/test_activation.py | 3 +- .../{ => core}/test_fused_quant_layernorm.py | 0 tests/kernels/{ => core}/test_layernorm.py | 0 tests/kernels/core/test_opcheck.py | 25 + tests/kernels/{ => core}/test_permute_cols.py | 0 tests/kernels/{ => core}/test_pos_encoding.py | 3 +- .../{ => core}/test_rotary_embedding.py | 0 tests/kernels/{ => core}/test_uva.py | 0 .../kernels/{ => mamba}/test_causal_conv1d.py | 0 .../kernels/{ => mamba}/test_mamba_mixer2.py | 0 tests/kernels/{ => mamba}/test_mamba_ssm.py | 0 .../kernels/{ => mamba}/test_mamba_ssm_ssd.py | 0 tests/kernels/moe/test_cutlass_moe.py | 364 ++++ tests/kernels/{ => moe}/test_moe.py | 3 +- .../kernels/moe/test_moe_permute_unpermute.py | 223 +++ .../{ => moe}/test_triton_moe_ptpc_fp8.py | 0 tests/kernels/quant_utils.py | 60 + .../{ => quantization}/test_allspark_gemm.py | 0 tests/kernels/{ => quantization}/test_aqlm.py | 0 tests/kernels/{ => quantization}/test_awq.py | 0 .../{ => quantization}/test_awq_marlin.py | 3 +- .../{ => quantization}/test_awq_triton.py | 0 .../{ => quantization}/test_block_fp8.py | 9 +- .../{ => quantization}/test_block_int8.py | 3 +- .../test_cutlass_2of4_sparse.py | 3 +- .../test_cutlass_scaled_mm.py} | 4 +- .../{ => quantization}/test_fp8_quant.py | 0 tests/kernels/{ => quantization}/test_ggml.py | 0 tests/kernels/{ => quantization}/test_gguf.py | 0 tests/kernels/{ => quantization}/test_gptq.py | 0 .../{ => quantization}/test_int8_kernel.py | 0 .../{ => quantization}/test_int8_quant.py | 0 .../{ => quantization}/test_machete_mm.py | 0 .../{ => quantization}/test_marlin_gemm.py | 0 .../{ => quantization}/test_nvfp4_quant.py | 0 .../test_nvfp4_scaled_mm.py | 0 .../quantization/test_rocm_skinny_gemms.py | 80 + .../test_triton_scaled_mm.py | 0 tests/kernels/test_attention_selector.py | 136 -- tests/kernels/test_cutlass_mla_decode.py | 93 + tests/kernels/test_cutlass_moe.py | 244 --- tests/kernels/test_fused_quant_activation.py | 69 + tests/kernels/test_rocm_attention_selector.py | 34 - tests/kernels/test_triton_flash_attention.py | 499 +++++ tests/kernels/test_utils.py | 25 - 
tests/kernels/utils_block.py | 63 - tests/kv_transfer/test_disagg.py | 4 +- tests/lora/test_llama_tp.py | 1 + tests/lora/test_lora_manager.py | 30 +- tests/lora/test_resolver.py | 74 + tests/lora/test_tokenizer_group.py | 10 +- tests/lora/test_utils.py | 12 + .../model_executor/test_enabled_custom_ops.py | 7 +- .../model_executor/test_guided_processors.py | 15 +- .../decoder_only/language/test_hybrid.py | 360 ---- .../decoder_only/language/test_mamba.py | 337 ---- tests/models/embedding/utils.py | 39 - tests/models/encoder_decoder/__init__.py | 0 .../audio_language/__init__.py | 0 .../encoder_decoder/language/__init__.py | 0 .../vision_language/__init__.py | 0 .../vision_language/test_broadcast.py | 37 - .../{decoder_only => language}/__init__.py | 0 .../generation}/__init__.py | 0 .../generation}/test_bart.py | 4 - .../generation/test_common.py} | 67 +- .../generation}/test_granite.py | 4 - .../models/language/generation/test_hybrid.py | 315 +++ .../generation}/test_mistral.py | 66 +- .../generation}/test_phimoe.py | 4 - .../language => language/pooling}/__init__.py | 0 .../pooling/test_classification.py} | 6 +- .../pooling}/test_embedding.py | 6 +- .../pooling}/test_gritlm.py | 189 +- .../pooling}/test_jina.py | 46 +- .../pooling}/test_scoring.py | 61 +- .../pooling/test_snowflake_arctic_embed.py | 95 + .../pooling/test_truncation_control.py | 69 + .../generation}/__init__.py | 0 .../generation/test_common.py} | 74 +- .../generation}/test_florence2.py | 21 +- .../generation/test_granite_speech.py | 144 ++ .../generation}/test_interleaved.py | 3 +- .../generation}/test_mllama.py | 50 +- .../generation}/test_phi4mm.py | 6 +- .../generation}/test_pixtral.py | 4 - .../generation}/test_qwen2_vl.py | 2 +- .../generation}/test_ultravox.py | 77 +- .../generation}/test_whisper.py | 58 +- .../generation}/vlm_utils/__init__.py | 0 .../generation}/vlm_utils/builders.py | 10 +- .../generation}/vlm_utils/case_filtering.py | 0 .../generation}/vlm_utils/core.py | 2 +- .../generation}/vlm_utils/custom_inputs.py | 0 .../generation}/vlm_utils/model_utils.py | 65 +- .../generation}/vlm_utils/runners.py | 11 +- .../generation}/vlm_utils/types.py | 6 +- .../pooling}/__init__.py | 0 .../pooling}/test_dse_qwen2_vl.py | 2 +- .../pooling}/test_intern_vit.py | 23 +- .../pooling}/test_llava_next.py | 2 +- .../pooling}/test_phi3v.py | 2 +- .../multimodal/processing/test_common.py | 5 + .../multimodal/processing/test_h2ovl.py | 4 +- .../multimodal/processing/test_idefics3.py | 4 +- .../multimodal/processing/test_internvl.py | 4 +- .../multimodal/processing/test_llama4.py | 4 +- .../processing/test_minimax_vl_01.py | 98 + .../multimodal/processing/test_phi3v.py | 4 +- .../multimodal/processing/test_phi4mm.py | 59 + .../multimodal/processing/test_qwen2_vl.py | 4 +- .../multimodal/processing/test_smolvlm.py | 4 +- .../language => quantization}/__init__.py | 0 .../language => quantization}/test_aqlm.py | 6 - .../test_awq.py | 7 +- tests/models/quantization/test_bitblas.py | 61 + .../language => quantization}/test_fp8.py | 7 +- .../language => quantization}/test_gguf.py | 7 +- .../models/quantization/test_gptq_bitblas.py | 59 + .../test_gptq_marlin.py | 8 +- .../test_gptq_marlin_24.py | 5 +- .../test_modelopt.py | 1 - .../language => quantization}/test_nvfp4.py | 1 - tests/models/registry.py | 94 +- tests/models/test_initialization.py | 5 +- tests/models/test_oot_registration.py | 5 +- tests/models/test_transformers.py | 5 +- tests/models/utils.py | 66 +- tests/multimodal/assets/image1.png | Bin 0 -> 1837 bytes 
tests/multimodal/assets/image2.png | Bin 0 -> 1837 bytes tests/multimodal/test_hasher.py | 61 + .../test_register_quantization_config.py | 4 +- tests/quantization/test_torchao.py | 26 + .../reasoning/test_qwen3_reasoning_parser.py | 141 ++ .../e2e/test_medusa_correctness.py | 2 +- tests/spec_decode/e2e/test_mlp_correctness.py | 4 +- .../e2e/test_multistep_correctness.py | 9 +- .../spec_decode/e2e/test_ngram_correctness.py | 2 +- tests/spec_decode/test_memory_usage.py | 90 + tests/spec_decode/test_scorer.py | 5 +- tests/test_config.py | 82 +- tests/test_utils.py | 54 +- tests/tokenization/test_cached_tokenizer.py | 43 +- tests/tokenization/test_detokenize.py | 209 +- tests/tokenization/test_get_eos.py | 2 +- tests/tokenization/test_tokenizer_group.py | 187 +- tests/tool_use/utils.py | 14 + tests/utils.py | 2 +- tests/v1/core/test_kv_cache_utils.py | 61 +- tests/v1/core/test_prefix_caching.py | 405 +++- tests/v1/core/test_scheduler.py | 539 ++++- tests/v1/core/test_specialized_manager.py | 8 +- tests/v1/e2e/test_cascade_attention.py | 10 +- tests/v1/e2e/test_spec_decode.py | 29 +- tests/v1/engine/conftest.py | 4 +- tests/v1/engine/test_async_llm.py | 33 + tests/v1/engine/test_engine_core.py | 110 +- tests/v1/engine/test_engine_core_client.py | 167 +- tests/v1/engine/test_output_processor.py | 130 +- tests/v1/engine/utils.py | 5 +- tests/v1/entrypoints/conftest.py | 34 +- .../llm/test_struct_output_generate.py | 225 ++- tests/v1/sample/test_sampler.py | 4 +- tests/v1/shutdown/test_delete.py | 97 + tests/v1/shutdown/test_forward_error.py | 129 ++ tests/v1/shutdown/test_processor_error.py | 69 + tests/v1/shutdown/test_startup_error.py | 97 + tests/v1/shutdown/utils.py | 5 + tests/v1/spec_decode/test_max_len.py | 57 + tests/v1/spec_decode/test_ngram.py | 63 +- tests/v1/structured_output/test_utils.py | 55 +- tests/v1/test_async_llm_dp.py | 4 +- tests/v1/test_oracle.py | 5 +- tests/v1/test_serial_utils.py | 208 +- tests/v1/tpu/test_basic.py | 7 +- tests/v1/tpu/test_multimodal.py | 93 + tests/v1/tpu/test_perf.py | 15 +- tests/v1/tpu/test_sampler.py | 22 + tests/v1/tpu/test_topk_topp_sampler.py | 22 +- tests/v1/tpu/worker/test_tpu_model_runner.py | 9 +- tests/v1/worker/test_gpu_input_batch.py | 1 - tests/v1/worker/test_gpu_model_runner.py | 1 - tests/worker/test_model_runner.py | 146 +- vllm/_custom_ops.py | 50 +- vllm/_ipex_ops.py | 20 +- vllm/assets/audio.py | 12 +- vllm/assets/image.py | 4 +- vllm/assets/video.py | 35 +- vllm/attention/backends/abstract.py | 5 + vllm/attention/backends/cpu_mla.py | 8 +- vllm/attention/backends/flash_attn.py | 6 +- vllm/attention/backends/flashinfer.py | 54 +- vllm/attention/backends/flashmla.py | 2 +- vllm/attention/backends/hpu_attn.py | 106 +- vllm/attention/backends/ipex_attn.py | 26 +- vllm/attention/backends/mla/common.py | 266 +-- vllm/attention/backends/rocm_aiter_mla.py | 412 ++++ vllm/attention/backends/rocm_flash_attn.py | 157 +- vllm/attention/backends/triton_mla.py | 2 +- vllm/attention/backends/utils.py | 28 +- vllm/attention/layer.py | 46 +- .../ops/chunked_prefill_paged_decode.py | 2 +- vllm/attention/ops/hpu_paged_attn.py | 1 - vllm/attention/ops/ipex_attn.py | 3 +- vllm/attention/ops/prefix_prefill.py | 1634 +++++++-------- vllm/attention/ops/rocm_aiter_mla.py | 42 + vllm/attention/ops/rocm_aiter_paged_attn.py | 101 + vllm/attention/ops/triton_flash_attention.py | 1762 +++++++++++------ .../utils}/fa_utils.py | 0 .../benchmarks}/__init__.py | 0 collect_env.py => vllm/collect_env.py | 32 +- vllm/compilation/activation_quant_fusion.py | 87 + 
vllm/compilation/backends.py | 41 +- vllm/compilation/compiler_interface.py | 65 +- vllm/compilation/fix_functionalization.py | 17 +- vllm/compilation/fusion.py | 8 +- vllm/compilation/fx_utils.py | 16 + vllm/compilation/inductor_pass.py | 32 + vllm/compilation/pass_manager.py | 31 +- vllm/compilation/sequence_parallelism.py | 266 +++ vllm/compilation/vllm_inductor_pass.py | 13 +- vllm/config.py | 1554 +++++++++------ vllm/connections.py | 2 +- vllm/core/scheduler.py | 33 +- vllm/distributed/communication_op.py | 6 + .../base_device_communicator.py | 34 + .../device_communicators/cuda_communicator.py | 25 + .../device_communicators/shm_broadcast.py | 35 +- vllm/distributed/kv_events.py | 295 +++ vllm/distributed/kv_transfer/__init__.py | 12 + .../kv_transfer/kv_connector/base.py | 4 + .../kv_transfer/kv_connector/factory.py | 57 +- .../kv_connector/mooncake_store_connector.py | 39 +- .../kv_connector/simple_connector.py | 95 +- .../kv_transfer/kv_connector/utils.py | 90 + .../kv_transfer/kv_connector/v1/__init__.py | 8 + .../kv_transfer/kv_connector/v1/base.py | 209 ++ .../kv_connector/v1/lmcache_connector.py | 131 ++ .../v1/shared_storage_connector.py | 383 ++++ ...ransfer_agent.py => kv_connector_agent.py} | 2 +- .../kv_transfer/kv_pipe/mooncake_pipe.py | 21 +- .../kv_transfer/kv_transfer_state.py | 70 + vllm/distributed/parallel_state.py | 93 +- vllm/distributed/utils.py | 22 +- vllm/engine/arg_utils.py | 1554 +++++++-------- vllm/engine/async_llm_engine.py | 131 +- vllm/engine/llm_engine.py | 205 +- vllm/engine/metrics.py | 9 +- vllm/engine/multiprocessing/client.py | 9 +- vllm/engine/multiprocessing/engine.py | 12 +- vllm/engine/output_processor/multi_step.py | 12 +- vllm/engine/output_processor/single_step.py | 15 +- vllm/engine/protocol.py | 7 +- vllm/entrypoints/api_server.py | 2 +- vllm/entrypoints/chat_utils.py | 76 +- vllm/entrypoints/cli/collect_env.py | 35 + vllm/entrypoints/cli/main.py | 2 + vllm/entrypoints/launcher.py | 98 +- vllm/entrypoints/llm.py | 188 +- vllm/entrypoints/openai/api_server.py | 42 +- vllm/entrypoints/openai/cli_args.py | 29 +- vllm/entrypoints/openai/protocol.py | 175 +- vllm/entrypoints/openai/run_batch.py | 4 +- vllm/entrypoints/openai/serving_chat.py | 62 +- vllm/entrypoints/openai/serving_embedding.py | 14 +- vllm/entrypoints/openai/serving_engine.py | 31 +- vllm/entrypoints/openai/serving_models.py | 70 + vllm/entrypoints/openai/serving_pooling.py | 14 +- vllm/entrypoints/openai/serving_score.py | 15 +- .../openai/tool_parsers/llama_tool_parser.py | 1 + .../tool_parsers/mistral_tool_parser.py | 17 + vllm/entrypoints/score_utils.py | 2 +- vllm/entrypoints/utils.py | 24 + vllm/env_override.py | 17 +- vllm/envs.py | 56 +- vllm/executor/executor_base.py | 2 +- vllm/executor/uniproc_executor.py | 4 +- vllm/forward_context.py | 39 +- vllm/inputs/__init__.py | 14 +- vllm/inputs/data.py | 260 +-- vllm/inputs/parse.py | 57 +- vllm/inputs/preprocess.py | 621 +++--- vllm/inputs/registry.py | 335 +--- vllm/logger.py | 23 +- vllm/lora/ops/triton_ops/__init__.py | 4 +- .../{lora_expand.py => lora_expand_op.py} | 0 .../{lora_shrink.py => lora_shrink_op.py} | 0 vllm/lora/punica_wrapper/punica_selector.py | 3 +- vllm/lora/resolver.py | 83 + vllm/lora/utils.py | 13 +- vllm/model_executor/custom_op.py | 6 +- .../guided_decoding/__init__.py | 30 +- .../guided_decoding/guidance_decoding.py | 18 +- .../guided_decoding/guided_fields.py | 11 +- .../outlines_logits_processors.py | 2 +- .../guided_decoding/reasoner/__init__.py | 35 - 
vllm/model_executor/guided_decoding/utils.py | 10 +- .../guided_decoding/xgrammar_decoding.py | 14 +- vllm/model_executor/layers/activation.py | 1 + ...me=AMD_Instinct_MI300X,dtype=fp8_w8a8.json | 164 ++ ...=1024,device_name=AMD_Instinct_MI300X.json | 200 ++ ...192,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++ .../E=128,N=192,device_name=NVIDIA_H20.json | 146 ++ .../E=128,N=192,device_name=NVIDIA_H200.json | 146 ++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++ .../E=128,N=384,device_name=NVIDIA_H20.json | 146 ++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++ .../E=128,N=384,device_name=NVIDIA_H200.json | 146 ++ ...512,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++ .../E=128,N=768,device_name=NVIDIA_H20.json | 146 ++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++ .../E=128,N=768,device_name=NVIDIA_H200.json | 146 ++ .../E=128,N=96,device_name=NVIDIA_H20.json | 146 ++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++ .../layers/fused_moe/configs/README | 3 +- .../layers/fused_moe/cutlass_moe.py | 43 +- .../layers/fused_moe/fused_marlin_moe.py | 4 +- .../layers/fused_moe/fused_moe.py | 25 +- vllm/model_executor/layers/fused_moe/layer.py | 25 +- .../layers/fused_moe/moe_permute_unpermute.py | 116 ++ .../layers/fused_moe/rocm_aiter_fused_moe.py | 434 +++- vllm/model_executor/layers/layernorm.py | 8 +- vllm/model_executor/layers/linear.py | 29 +- .../layers/mamba/ops/mamba_ssm.py | 4 +- .../layers/quantization/__init__.py | 20 +- .../layers/quantization/aqlm.py | 3 +- .../model_executor/layers/quantization/awq.py | 3 +- .../layers/quantization/awq_marlin.py | 14 +- .../layers/quantization/base_config.py | 13 +- .../layers/quantization/bitblas.py | 460 +++++ .../layers/quantization/bitsandbytes.py | 3 +- .../compressed_tensors/compressed_tensors.py | 7 +- .../compressed_tensors_moe.py | 171 +- .../layers/quantization/deepspeedfp.py | 5 +- .../layers/quantization/experts_int8.py | 3 +- .../layers/quantization/fbgemm_fp8.py | 3 +- .../model_executor/layers/quantization/fp8.py | 13 +- .../layers/quantization/gguf.py | 3 +- .../layers/quantization/gptq.py | 3 +- .../layers/quantization/gptq_bitblas.py | 444 +++++ .../layers/quantization/gptq_marlin.py | 9 +- .../layers/quantization/gptq_marlin_24.py | 7 +- .../layers/quantization/hqq_marlin.py | 3 +- .../layers/quantization/ipex_quant.py | 7 +- .../kernels/mixed_precision/__init__.py | 5 +- .../kernels/mixed_precision/bitblas.py | 299 +++ .../quantization/kernels/scaled_mm/cutlass.py | 2 +- .../layers/quantization/kv_cache.py | 36 + .../layers/quantization/marlin.py | 7 +- .../layers/quantization/modelopt.py | 7 +- .../layers/quantization/moe_wna16.py | 7 +- .../layers/quantization/neuron_quant.py | 3 +- .../layers/quantization/ptpc_fp8.py | 3 +- .../model_executor/layers/quantization/qqq.py | 3 +- .../layers/quantization/quark/quark.py | 38 +- .../layers/quantization/torchao.py | 3 +- .../layers/quantization/tpu_int8.py | 3 +- .../quantization/utils/bitblas_utils.py | 207 ++ .../layers/quantization/utils/int8_utils.py | 28 +- .../layers/quantization/utils/w8a8_utils.py | 266 ++- .../layers/rejection_sampler.py | 31 +- .../model_executor/layers/rotary_embedding.py | 362 +++- vllm/model_executor/layers/sampler.py | 9 +- .../layers/typical_acceptance_sampler.py | 15 +- vllm/model_executor/layers/utils.py | 51 +- .../layers/vocab_parallel_embedding.py | 3 +- 
vllm/model_executor/model_loader/loader.py | 82 +- vllm/model_executor/model_loader/neuron.py | 1 - vllm/model_executor/model_loader/utils.py | 37 +- .../model_loader/weight_utils.py | 25 +- vllm/model_executor/models/aimv2.py | 322 +++ vllm/model_executor/models/arctic.py | 16 +- vllm/model_executor/models/aria.py | 12 +- vllm/model_executor/models/aya_vision.py | 16 - vllm/model_executor/models/baichuan.py | 10 - vllm/model_executor/models/bamba.py | 10 - vllm/model_executor/models/bart.py | 10 - vllm/model_executor/models/bert.py | 201 +- vllm/model_executor/models/blip2.py | 58 +- vllm/model_executor/models/bloom.py | 10 - vllm/model_executor/models/chameleon.py | 18 +- vllm/model_executor/models/chatglm.py | 10 - vllm/model_executor/models/commandr.py | 23 +- vllm/model_executor/models/dbrx.py | 16 +- vllm/model_executor/models/deepseek.py | 10 - vllm/model_executor/models/deepseek_mtp.py | 11 - vllm/model_executor/models/deepseek_v2.py | 31 +- vllm/model_executor/models/deepseek_vl2.py | 32 +- vllm/model_executor/models/eagle.py | 13 - vllm/model_executor/models/exaone.py | 11 - vllm/model_executor/models/falcon.py | 10 - vllm/model_executor/models/florence2.py | 21 - vllm/model_executor/models/fuyu.py | 13 - vllm/model_executor/models/gemma.py | 10 - vllm/model_executor/models/gemma2.py | 14 +- vllm/model_executor/models/gemma3.py | 14 +- vllm/model_executor/models/gemma3_mm.py | 34 +- vllm/model_executor/models/glm.py | 5 +- vllm/model_executor/models/glm4.py | 18 +- vllm/model_executor/models/gpt2.py | 10 - vllm/model_executor/models/gpt_bigcode.py | 64 +- vllm/model_executor/models/gpt_j.py | 110 +- vllm/model_executor/models/gpt_neox.py | 10 - vllm/model_executor/models/granite.py | 11 - vllm/model_executor/models/granite_speech.py | 777 ++++++++ vllm/model_executor/models/granitemoe.py | 11 - .../model_executor/models/granitemoeshared.py | 11 - vllm/model_executor/models/grok1.py | 10 - vllm/model_executor/models/h2ovl.py | 16 +- vllm/model_executor/models/idefics3.py | 10 - vllm/model_executor/models/interfaces.py | 4 +- vllm/model_executor/models/interfaces_base.py | 9 - vllm/model_executor/models/internlm2.py | 12 +- vllm/model_executor/models/internvl.py | 18 +- vllm/model_executor/models/jais.py | 10 - vllm/model_executor/models/jamba.py | 10 - vllm/model_executor/models/kimi_vl.py | 75 +- vllm/model_executor/models/llama.py | 42 +- vllm/model_executor/models/llama4.py | 8 +- vllm/model_executor/models/llama_eagle.py | 32 +- vllm/model_executor/models/llama_eagle3.py | 242 +++ vllm/model_executor/models/llava.py | 29 +- vllm/model_executor/models/llava_next.py | 23 +- .../model_executor/models/llava_next_video.py | 16 - vllm/model_executor/models/llava_onevision.py | 54 +- vllm/model_executor/models/mamba.py | 51 +- vllm/model_executor/models/mamba2.py | 10 - vllm/model_executor/models/minicpm.py | 10 - vllm/model_executor/models/minicpmo.py | 36 +- vllm/model_executor/models/minicpmv.py | 20 +- vllm/model_executor/models/minimax_text_01.py | 79 +- vllm/model_executor/models/minimax_vl_01.py | 363 ++++ vllm/model_executor/models/mistral3.py | 45 +- vllm/model_executor/models/mixtral.py | 10 - vllm/model_executor/models/mixtral_quant.py | 10 - vllm/model_executor/models/mllama.py | 10 - vllm/model_executor/models/mllama4.py | 19 +- vllm/model_executor/models/modernbert.py | 325 +++ vllm/model_executor/models/molmo.py | 14 +- vllm/model_executor/models/mpt.py | 10 - vllm/model_executor/models/nemotron.py | 11 - vllm/model_executor/models/nemotron_nas.py | 8 - 
vllm/model_executor/models/olmo.py | 10 - vllm/model_executor/models/olmo2.py | 12 +- vllm/model_executor/models/olmoe.py | 113 +- vllm/model_executor/models/opt.py | 98 +- vllm/model_executor/models/orion.py | 101 +- vllm/model_executor/models/ovis2.py | 388 ++++ vllm/model_executor/models/paligemma.py | 14 +- vllm/model_executor/models/persimmon.py | 87 +- vllm/model_executor/models/phi.py | 10 - vllm/model_executor/models/phi3_small.py | 57 +- vllm/model_executor/models/phi3v.py | 18 +- vllm/model_executor/models/phi4mm.py | 1730 ++++++---------- vllm/model_executor/models/phi4mm_audio.py | 75 +- vllm/model_executor/models/phi4mm_utils.py | 4 +- vllm/model_executor/models/phimoe.py | 10 - vllm/model_executor/models/pixtral.py | 40 +- vllm/model_executor/models/plamo2.py | 736 +++++++ vllm/model_executor/models/qwen.py | 10 - vllm/model_executor/models/qwen2.py | 10 - .../models/qwen2_5_omni_thinker.py | 901 +++++++++ vllm/model_executor/models/qwen2_5_vl.py | 103 +- vllm/model_executor/models/qwen2_audio.py | 25 +- vllm/model_executor/models/qwen2_moe.py | 19 +- vllm/model_executor/models/qwen2_vl.py | 27 +- vllm/model_executor/models/qwen3.py | 10 - vllm/model_executor/models/qwen3_moe.py | 19 +- vllm/model_executor/models/qwen_vl.py | 2 +- vllm/model_executor/models/registry.py | 28 +- vllm/model_executor/models/skyworkr1v.py | 18 +- vllm/model_executor/models/solar.py | 11 - vllm/model_executor/models/stablelm.py | 18 +- vllm/model_executor/models/starcoder2.py | 10 - vllm/model_executor/models/transformers.py | 48 +- vllm/model_executor/models/ultravox.py | 16 - vllm/model_executor/models/utils.py | 2 +- vllm/model_executor/models/vision.py | 25 +- vllm/model_executor/models/whisper.py | 10 - vllm/model_executor/models/zamba2.py | 19 - vllm/model_executor/parameter.py | 33 +- vllm/multimodal/__init__.py | 11 +- vllm/multimodal/audio.py | 76 +- vllm/multimodal/base.py | 328 +-- vllm/multimodal/hasher.py | 32 +- vllm/multimodal/image.py | 80 +- vllm/multimodal/inputs.py | 274 ++- vllm/multimodal/parse.py | 39 +- vllm/multimodal/processing.py | 643 +++--- vllm/multimodal/profiling.py | 42 +- vllm/multimodal/registry.py | 305 +-- vllm/multimodal/utils.py | 41 +- vllm/multimodal/video.py | 71 +- vllm/outputs.py | 41 +- vllm/platforms/cpu.py | 2 - vllm/platforms/cuda.py | 18 +- vllm/platforms/interface.py | 26 +- vllm/platforms/neuron.py | 2 +- vllm/platforms/rocm.py | 85 +- vllm/platforms/tpu.py | 29 +- vllm/pooling_params.py | 11 +- vllm/profiler/__init__.py | 7 - vllm/reasoning/__init__.py | 2 + vllm/reasoning/qwen3_reasoning_parser.py | 150 ++ vllm/sampling_params.py | 63 +- vllm/sequence.py | 152 +- vllm/spec_decode/draft_model_runner.py | 19 +- vllm/spec_decode/metrics.py | 11 +- vllm/spec_decode/multi_step_worker.py | 13 +- .../spec_decode/smaller_tp_proposer_worker.py | 3 +- vllm/spec_decode/spec_decode_worker.py | 5 +- vllm/transformers_utils/config.py | 67 +- vllm/transformers_utils/configs/__init__.py | 8 +- vllm/transformers_utils/configs/dbrx.py | 3 +- vllm/transformers_utils/configs/eagle.py | 19 +- vllm/transformers_utils/configs/exaone.py | 44 +- .../configs/minimax_text_01.py | 69 + .../configs/minimax_vl_01.py | 70 + vllm/transformers_utils/configs/olmo2.py | 168 -- vllm/transformers_utils/configs/ovis2.py | 170 ++ vllm/transformers_utils/detokenizer.py | 4 +- vllm/transformers_utils/processor.py | 69 +- .../transformers_utils/processors/__init__.py | 3 +- vllm/transformers_utils/processors/ovis2.py | 399 ++++ vllm/transformers_utils/tokenizer.py | 57 +- 
vllm/transformers_utils/tokenizer_base.py | 36 +- .../{tokenizer_group => }/tokenizer_group.py | 39 +- .../tokenizer_group/__init__.py | 56 - .../tokenizer_group/base_tokenizer_group.py | 68 - .../tokenizer_group/ray_tokenizer_group.py | 244 --- vllm/transformers_utils/tokenizers/mistral.py | 13 +- vllm/triton_utils/__init__.py | 2 +- vllm/triton_utils/importing.py | 34 +- vllm/usage/usage_lib.py | 17 +- vllm/utils.py | 103 +- vllm/v1/attention/backends/flash_attn.py | 139 +- vllm/v1/attention/backends/flashinfer.py | 635 ++++++ vllm/v1/attention/backends/mla/common.py | 173 +- vllm/v1/attention/backends/mla/flashmla.py | 2 +- vllm/v1/attention/backends/mla/triton_mla.py | 2 +- vllm/v1/attention/backends/pallas.py | 15 + vllm/v1/core/block_pool.py | 49 +- vllm/v1/core/kv_cache_manager.py | 78 +- vllm/v1/core/kv_cache_utils.py | 27 +- vllm/v1/core/sched/interface.py | 10 +- vllm/v1/core/sched/output.py | 7 +- vllm/v1/core/sched/scheduler.py | 224 ++- vllm/v1/core/specialized_manager.py | 39 +- vllm/v1/engine/__init__.py | 21 +- vllm/v1/engine/async_llm.py | 277 ++- vllm/v1/engine/core.py | 215 +- vllm/v1/engine/core_client.py | 378 ++-- vllm/v1/engine/detokenizer.py | 253 ++- vllm/v1/engine/exceptions.py | 16 + vllm/v1/engine/llm_engine.py | 53 +- vllm/v1/engine/mm_input_cache.py | 5 +- vllm/v1/engine/output_processor.py | 43 +- vllm/v1/engine/processor.py | 89 +- vllm/v1/executor/abstract.py | 11 +- vllm/v1/executor/multiproc_executor.py | 336 ++-- vllm/v1/metrics/loggers.py | 125 +- vllm/v1/request.py | 6 +- vllm/v1/sample/ops/topk_topp_sampler.py | 27 +- vllm/v1/sample/rejection_sampler.py | 16 +- vllm/v1/sample/tpu/metadata.py | 19 +- vllm/v1/serial_utils.py | 160 +- vllm/v1/spec_decode/eagle.py | 199 +- vllm/v1/spec_decode/metrics.py | 132 +- vllm/v1/spec_decode/ngram_proposer.py | 10 +- vllm/v1/structured_output/__init__.py | 71 +- vllm/v1/structured_output/backend_guidance.py | 105 +- vllm/v1/structured_output/backend_types.py | 31 + vllm/v1/structured_output/backend_xgrammar.py | 177 +- vllm/v1/structured_output/request.py | 2 + vllm/v1/structured_output/utils.py | 120 -- vllm/v1/utils.py | 50 +- vllm/v1/worker/gpu_input_batch.py | 1 - vllm/v1/worker/gpu_model_runner.py | 216 +- vllm/v1/worker/gpu_worker.py | 42 +- vllm/v1/worker/lora_model_runner_mixin.py | 16 +- vllm/v1/worker/tpu_model_runner.py | 485 ++++- vllm/v1/worker/tpu_worker.py | 10 +- vllm/v1/worker/utils.py | 6 +- vllm/worker/cache_engine.py | 23 +- vllm/worker/cpu_enc_dec_model_runner.py | 2 +- vllm/worker/cpu_model_runner.py | 60 +- vllm/worker/enc_dec_model_runner.py | 21 +- vllm/worker/hpu_model_runner.py | 521 ++--- vllm/worker/hpu_worker.py | 8 +- vllm/worker/model_runner.py | 192 +- vllm/worker/multi_step_model_runner.py | 12 +- vllm/worker/neuron_model_runner.py | 20 +- vllm/worker/pooling_model_runner.py | 15 +- vllm/worker/tpu_worker.py | 4 +- vllm/worker/worker.py | 31 +- vllm/worker/xpu_model_runner.py | 21 +- vllm/worker/xpu_worker.py | 9 +- 892 files changed, 44542 insertions(+), 16160 deletions(-) create mode 100644 benchmarks/auto_tune.sh create mode 100644 benchmarks/kernels/benchmark_bitblas.py create mode 100644 benchmarks/kernels/benchmark_moe_permute_unpermute.py create mode 100644 csrc/attention/mla/cutlass_mla_entry.cu create mode 100644 csrc/attention/mla/cutlass_mla_kernels.cu create mode 100644 csrc/moe/moe_permute_unpermute_op.cu create mode 100644 csrc/moe/permute_unpermute_kernels/dispatch.h create mode 100644 csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu create mode 
100644 csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h create mode 100644 csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl create mode 100644 csrc/quantization/activation_kernels.cu create mode 100644 csrc/rocm/skinny_gemms.cu create mode 100644 docker/Dockerfile.nightly_torch delete mode 100644 docs/source/api/engine/async_llm_engine.md delete mode 100644 docs/source/api/engine/index.md delete mode 100644 docs/source/api/engine/llm_engine.md delete mode 100644 docs/source/api/inference_params.md delete mode 100644 docs/source/api/model/adapters.md delete mode 100644 docs/source/api/model/index.md delete mode 100644 docs/source/api/model/interfaces.md delete mode 100644 docs/source/api/model/interfaces_base.md delete mode 100644 docs/source/api/multimodal/index.md delete mode 100644 docs/source/api/multimodal/inputs.md delete mode 100644 docs/source/api/multimodal/parse.md delete mode 100644 docs/source/api/multimodal/processing.md delete mode 100644 docs/source/api/multimodal/profiling.md delete mode 100644 docs/source/api/multimodal/registry.md delete mode 100644 docs/source/api/offline_inference/index.md delete mode 100644 docs/source/api/offline_inference/llm.md delete mode 100644 docs/source/api/offline_inference/llm_inputs.md create mode 100644 docs/source/api/summary.md create mode 100644 docs/source/assets/deployment/anything-llm-chat-with-doc.png create mode 100644 docs/source/assets/deployment/anything-llm-chat-without-doc.png create mode 100644 docs/source/assets/deployment/anything-llm-provider.png create mode 100644 docs/source/assets/deployment/anything-llm-upload-doc.png create mode 100644 docs/source/assets/deployment/open_webui.png create mode 100644 docs/source/assets/deployment/streamlit-chat.png create mode 100644 docs/source/autodoc2_docstring_parser.py create mode 100644 docs/source/contributing/deprecation_policy.md create mode 100644 docs/source/deployment/frameworks/anything-llm.md create mode 100644 docs/source/deployment/frameworks/open-webui.md create mode 100644 docs/source/deployment/frameworks/streamlit.md create mode 100644 docs/source/deployment/security.md create mode 100644 docs/source/features/quantization/bitblas.md create mode 100644 docs/source/features/quantization/modelopt.md create mode 100644 examples/lmcache/README.md create mode 100644 examples/lmcache/cpu_offload_lmcache.py rename examples/{offline_inference/disaggregated_prefill_lmcache.py => lmcache/disagg_prefill_lmcache_v0.py} (100%) create mode 100644 examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml create mode 100644 examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml create mode 100644 examples/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh create mode 100644 examples/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py create mode 100644 examples/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh create mode 100644 examples/lmcache/kv_cache_sharing_lmcache_v1.py create mode 100644 examples/offline_inference/batch_llm_inference.py delete mode 100644 examples/offline_inference/cpu_offload_lmcache.py create mode 100644 examples/offline_inference/disaggregated-prefill-v1/decode_example.py create mode 100644 examples/offline_inference/disaggregated-prefill-v1/prefill_example.py create mode 100644 examples/offline_inference/disaggregated-prefill-v1/run.sh delete mode 100644 examples/offline_inference/distributed.py create mode 100644 
examples/offline_inference/qwen2_5_omni/README.md create mode 100644 examples/offline_inference/qwen2_5_omni/only_thinker.py create mode 100644 examples/online_serving/kv_events.sh create mode 100644 examples/online_serving/kv_events_subscriber.py create mode 100644 examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py create mode 100644 examples/online_serving/openai_embedding_matryoshka_fy.py create mode 100644 examples/online_serving/ray_serve_deepseek.py create mode 100644 examples/online_serving/streamlit_openai_chatbot_webserver.py create mode 100644 examples/tool_chat_template_llama4_json.jinja create mode 100644 examples/tool_chat_template_mistral3.jinja create mode 100644 requirements/nightly_torch_test.txt create mode 100644 tests/compile/test_sequence_parallelism.py create mode 100644 tests/compile/test_silu_mul_quant_fusion.py create mode 100644 tests/distributed/conftest.py create mode 100644 tests/distributed/test_events.py create mode 100644 tests/distributed/test_sequence_parallel.py create mode 100644 tests/engine/test_options.py delete mode 100644 tests/engine/test_skip_tokenizer_init.py create mode 100644 tests/entrypoints/openai/test_lora_resolvers.py create mode 100644 tests/entrypoints/openai/test_openai_schema.py create mode 100644 tests/entrypoints/openai/test_truncation.py rename tests/kernels/{ => attention}/conftest.py (100%) rename tests/kernels/{ => attention}/test_attention.py (99%) create mode 100644 tests/kernels/attention/test_attention_selector.py rename tests/kernels/{ => attention}/test_blocksparse_attention.py (99%) rename tests/kernels/{ => attention}/test_cache.py (93%) rename tests/kernels/{ => attention}/test_cascade_flash_attn.py (100%) rename tests/kernels/{ => attention}/test_encoder_decoder_attn.py (100%) rename tests/kernels/{ => attention}/test_flash_attn.py (99%) rename tests/kernels/{ => attention}/test_flashinfer.py (100%) rename tests/kernels/{ => attention}/test_flashmla.py (100%) rename tests/kernels/{ => attention}/test_lightning_attn.py (100%) rename tests/kernels/{ => attention}/test_merge_attn_states.py (100%) rename tests/kernels/{ => attention}/test_mha_attn.py (100%) rename tests/kernels/{ => attention}/test_mla_decode_cpu.py (100%) rename tests/kernels/{ => attention}/test_prefix_prefill.py (100%) create mode 100644 tests/kernels/attention/test_rocm_attention_selector.py rename tests/kernels/{ => attention}/test_triton_decode_attention.py (100%) rename tests/kernels/{ => core}/test_activation.py (97%) rename tests/kernels/{ => core}/test_fused_quant_layernorm.py (100%) rename tests/kernels/{ => core}/test_layernorm.py (100%) create mode 100644 tests/kernels/core/test_opcheck.py rename tests/kernels/{ => core}/test_permute_cols.py (100%) rename tests/kernels/{ => core}/test_pos_encoding.py (99%) rename tests/kernels/{ => core}/test_rotary_embedding.py (100%) rename tests/kernels/{ => core}/test_uva.py (100%) rename tests/kernels/{ => mamba}/test_causal_conv1d.py (100%) rename tests/kernels/{ => mamba}/test_mamba_mixer2.py (100%) rename tests/kernels/{ => mamba}/test_mamba_ssm.py (100%) rename tests/kernels/{ => mamba}/test_mamba_ssm_ssd.py (100%) create mode 100644 tests/kernels/moe/test_cutlass_moe.py rename tests/kernels/{ => moe}/test_moe.py (99%) create mode 100644 tests/kernels/moe/test_moe_permute_unpermute.py rename tests/kernels/{ => moe}/test_triton_moe_ptpc_fp8.py (100%) rename tests/kernels/{ => quantization}/test_allspark_gemm.py (100%) rename tests/kernels/{ => quantization}/test_aqlm.py 
(100%) rename tests/kernels/{ => quantization}/test_awq.py (100%) rename tests/kernels/{ => quantization}/test_awq_marlin.py (98%) rename tests/kernels/{ => quantization}/test_awq_triton.py (100%) rename tests/kernels/{ => quantization}/test_block_fp8.py (98%) rename tests/kernels/{ => quantization}/test_block_int8.py (99%) rename tests/kernels/{ => quantization}/test_cutlass_2of4_sparse.py (99%) rename tests/kernels/{test_cutlass.py => quantization/test_cutlass_scaled_mm.py} (99%) rename tests/kernels/{ => quantization}/test_fp8_quant.py (100%) rename tests/kernels/{ => quantization}/test_ggml.py (100%) rename tests/kernels/{ => quantization}/test_gguf.py (100%) rename tests/kernels/{ => quantization}/test_gptq.py (100%) rename tests/kernels/{ => quantization}/test_int8_kernel.py (100%) rename tests/kernels/{ => quantization}/test_int8_quant.py (100%) rename tests/kernels/{ => quantization}/test_machete_mm.py (100%) rename tests/kernels/{ => quantization}/test_marlin_gemm.py (100%) rename tests/kernels/{ => quantization}/test_nvfp4_quant.py (100%) rename tests/kernels/{ => quantization}/test_nvfp4_scaled_mm.py (100%) create mode 100644 tests/kernels/quantization/test_rocm_skinny_gemms.py rename tests/kernels/{ => quantization}/test_triton_scaled_mm.py (100%) delete mode 100644 tests/kernels/test_attention_selector.py create mode 100644 tests/kernels/test_cutlass_mla_decode.py delete mode 100644 tests/kernels/test_cutlass_moe.py create mode 100644 tests/kernels/test_fused_quant_activation.py delete mode 100644 tests/kernels/test_rocm_attention_selector.py create mode 100644 tests/kernels/test_triton_flash_attention.py delete mode 100644 tests/kernels/test_utils.py delete mode 100644 tests/kernels/utils_block.py create mode 100644 tests/lora/test_resolver.py delete mode 100644 tests/models/decoder_only/language/test_hybrid.py delete mode 100644 tests/models/decoder_only/language/test_mamba.py delete mode 100644 tests/models/embedding/utils.py delete mode 100644 tests/models/encoder_decoder/__init__.py delete mode 100644 tests/models/encoder_decoder/audio_language/__init__.py delete mode 100644 tests/models/encoder_decoder/language/__init__.py delete mode 100644 tests/models/encoder_decoder/vision_language/__init__.py delete mode 100644 tests/models/encoder_decoder/vision_language/test_broadcast.py rename tests/models/{decoder_only => language}/__init__.py (100%) rename tests/models/{decoder_only/audio_language => language/generation}/__init__.py (100%) rename tests/models/{encoder_decoder/language => language/generation}/test_bart.py (98%) rename tests/models/{decoder_only/language/test_models.py => language/generation/test_common.py} (65%) rename tests/models/{decoder_only/language => language/generation}/test_granite.py (89%) create mode 100644 tests/models/language/generation/test_hybrid.py rename tests/models/{decoder_only/language => language/generation}/test_mistral.py (86%) rename tests/models/{decoder_only/language => language/generation}/test_phimoe.py (96%) rename tests/models/{decoder_only/language => language/pooling}/__init__.py (100%) rename tests/models/{embedding/language/test_cls_models.py => language/pooling/test_classification.py} (91%) rename tests/models/{embedding/language => language/pooling}/test_embedding.py (94%) rename tests/models/{embedding/language => language/pooling}/test_gritlm.py (64%) rename tests/models/{embedding/language => language/pooling}/test_jina.py (82%) rename tests/models/{embedding/language => language/pooling}/test_scoring.py (72%) create mode 
100644 tests/models/language/pooling/test_snowflake_arctic_embed.py create mode 100644 tests/models/language/pooling/test_truncation_control.py rename tests/models/{decoder_only/vision_language => multimodal/generation}/__init__.py (100%) rename tests/models/{decoder_only/vision_language/test_models.py => multimodal/generation/test_common.py} (91%) rename tests/models/{encoder_decoder/vision_language => multimodal/generation}/test_florence2.py (87%) create mode 100644 tests/models/multimodal/generation/test_granite_speech.py rename tests/models/{decoder_only/vision_language => multimodal/generation}/test_interleaved.py (96%) rename tests/models/{encoder_decoder/vision_language => multimodal/generation}/test_mllama.py (95%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/test_phi4mm.py (98%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/test_pixtral.py (98%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/test_qwen2_vl.py (99%) rename tests/models/{decoder_only/audio_language => multimodal/generation}/test_ultravox.py (79%) rename tests/models/{encoder_decoder/audio_language => multimodal/generation}/test_whisper.py (84%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/vlm_utils/__init__.py (100%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/vlm_utils/builders.py (97%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/vlm_utils/case_filtering.py (100%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/vlm_utils/core.py (99%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/vlm_utils/custom_inputs.py (100%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/vlm_utils/model_utils.py (91%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/vlm_utils/runners.py (94%) rename tests/models/{decoder_only/vision_language => multimodal/generation}/vlm_utils/types.py (97%) rename tests/models/{embedding => multimodal/pooling}/__init__.py (100%) rename tests/models/{embedding/vision_language => multimodal/pooling}/test_dse_qwen2_vl.py (99%) rename tests/models/{decoder_only/vision_language => multimodal/pooling}/test_intern_vit.py (84%) rename tests/models/{embedding/vision_language => multimodal/pooling}/test_llava_next.py (99%) rename tests/models/{embedding/vision_language => multimodal/pooling}/test_phi3v.py (98%) create mode 100644 tests/models/multimodal/processing/test_minimax_vl_01.py create mode 100644 tests/models/multimodal/processing/test_phi4mm.py rename tests/models/{embedding/language => quantization}/__init__.py (100%) rename tests/models/{decoder_only/language => quantization}/test_aqlm.py (94%) rename tests/models/{decoder_only/vision_language => quantization}/test_awq.py (96%) create mode 100644 tests/models/quantization/test_bitblas.py rename tests/models/{decoder_only/language => quantization}/test_fp8.py (97%) rename tests/models/{decoder_only/language => quantization}/test_gguf.py (97%) create mode 100644 tests/models/quantization/test_gptq_bitblas.py rename tests/models/{decoder_only/language => quantization}/test_gptq_marlin.py (94%) rename tests/models/{decoder_only/language => quantization}/test_gptq_marlin_24.py (95%) rename tests/models/{decoder_only/language => quantization}/test_modelopt.py (99%) rename tests/models/{decoder_only/language => quantization}/test_nvfp4.py (99%) create mode 100644 
tests/multimodal/assets/image1.png create mode 100644 tests/multimodal/assets/image2.png create mode 100644 tests/multimodal/test_hasher.py create mode 100644 tests/reasoning/test_qwen3_reasoning_parser.py create mode 100644 tests/spec_decode/test_memory_usage.py create mode 100644 tests/v1/shutdown/test_delete.py create mode 100644 tests/v1/shutdown/test_forward_error.py create mode 100644 tests/v1/shutdown/test_processor_error.py create mode 100644 tests/v1/shutdown/test_startup_error.py create mode 100644 tests/v1/shutdown/utils.py create mode 100644 tests/v1/spec_decode/test_max_len.py create mode 100644 tests/v1/tpu/test_multimodal.py create mode 100644 vllm/attention/backends/rocm_aiter_mla.py create mode 100644 vllm/attention/ops/rocm_aiter_mla.py create mode 100644 vllm/attention/ops/rocm_aiter_paged_attn.py rename vllm/{vllm_flash_attn => attention/utils}/fa_utils.py (100%) rename {tests/models/embedding/vision_language => vllm/benchmarks}/__init__.py (100%) rename collect_env.py => vllm/collect_env.py (96%) create mode 100644 vllm/compilation/activation_quant_fusion.py create mode 100644 vllm/compilation/sequence_parallelism.py create mode 100644 vllm/distributed/kv_events.py create mode 100644 vllm/distributed/kv_transfer/kv_connector/utils.py create mode 100644 vllm/distributed/kv_transfer/kv_connector/v1/__init__.py create mode 100644 vllm/distributed/kv_transfer/kv_connector/v1/base.py create mode 100644 vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py create mode 100644 vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py rename vllm/distributed/kv_transfer/{kv_transfer_agent.py => kv_connector_agent.py} (97%) create mode 100644 vllm/distributed/kv_transfer/kv_transfer_state.py create mode 100644 vllm/entrypoints/cli/collect_env.py rename vllm/lora/ops/triton_ops/{lora_expand.py => lora_expand_op.py} (100%) rename vllm/lora/ops/triton_ops/{lora_shrink.py => lora_shrink_op.py} (100%) create mode 100644 vllm/lora/resolver.py delete mode 100644 vllm/model_executor/guided_decoding/reasoner/__init__.py create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 
vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py create mode 100644 vllm/model_executor/layers/quantization/bitblas.py create mode 100644 vllm/model_executor/layers/quantization/gptq_bitblas.py create mode 100644 vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py create mode 100644 vllm/model_executor/layers/quantization/utils/bitblas_utils.py create mode 100644 vllm/model_executor/models/aimv2.py create mode 100644 vllm/model_executor/models/granite_speech.py create mode 100644 vllm/model_executor/models/llama_eagle3.py create mode 100644 vllm/model_executor/models/minimax_vl_01.py create mode 100644 vllm/model_executor/models/modernbert.py create mode 100644 vllm/model_executor/models/ovis2.py create mode 100644 vllm/model_executor/models/plamo2.py create mode 100644 vllm/model_executor/models/qwen2_5_omni_thinker.py create mode 100644 vllm/reasoning/qwen3_reasoning_parser.py create mode 100644 vllm/transformers_utils/configs/minimax_text_01.py create mode 100644 vllm/transformers_utils/configs/minimax_vl_01.py delete mode 100644 vllm/transformers_utils/configs/olmo2.py create mode 100644 vllm/transformers_utils/configs/ovis2.py create mode 100644 vllm/transformers_utils/processors/ovis2.py rename vllm/transformers_utils/{tokenizer_group => }/tokenizer_group.py (77%) delete mode 100644 vllm/transformers_utils/tokenizer_group/__init__.py delete mode 100644 vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py delete mode 100644 vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py create mode 100755 vllm/v1/attention/backends/flashinfer.py create mode 100644 vllm/v1/engine/exceptions.py diff --git a/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml index d70ecb2a7e7..d392a5f6406 100644 --- a/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml +++ b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml @@ -1,3 +1,4 @@ +# For vllm script, with -t option (tensor parallel size). 
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2 model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat" tasks: diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml index 4397effa82c..4b7776b20da 100644 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml @@ -1,3 +1,4 @@ +# For hf script, without -t option (tensor parallel size). # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform" tasks: diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml index fa6ea236ef0..05b66175199 100644 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml @@ -1,3 +1,4 @@ +# For hf script, without -t option (tensor parallel size). # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 model_name: "meta-llama/Meta-Llama-3-70B-Instruct" tasks: diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml index c513159c6fa..12a87e52901 100644 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml @@ -1,3 +1,4 @@ +# For vllm script, with -t option (tensor parallel size). # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors" tasks: diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml index 5e57fcbcf7d..7c7a1ca6edb 100644 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml @@ -1,3 +1,4 @@ +# For vllm script, with -t option (tensor parallel size). # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform" tasks: diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml index 374171f1f91..1d45c377045 100644 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml @@ -1,3 +1,4 @@ +# For vllm script, with -t option (tensor parallel size). 
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1 model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" tasks: diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml index dc36b705634..29a145252ef 100644 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml @@ -1,3 +1,4 @@ +# For vllm script, with -t option (tensor parallel size). # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1 model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" tasks: diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml index 0ecfc01ef04..3a5f120b3e7 100644 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml @@ -1,3 +1,4 @@ +# For vllm script, with -t option (tensor parallel size). # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test" tasks: diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml index bc290029859..5ff57bae492 100644 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml @@ -1,3 +1,4 @@ +# For vllm script, with -t option (tensor parallel size). # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test" tasks: diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml index 3964f3be5e8..07fb130464a 100644 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml @@ -1,3 +1,4 @@ +# For vllm script, with -t option (tensor parallel size). 
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test" tasks: diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml index fb4b4915ab9..c27886525bb 100644 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml @@ -1,4 +1,5 @@ -# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1 +# For hf script, without -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 model_name: "meta-llama/Meta-Llama-3-8B-Instruct" tasks: - name: "gsm8k" diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml index 04245865983..56ec933c9cc 100644 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml @@ -1,3 +1,4 @@ +# For vllm script, with -t option (tensor parallel size). # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1 model_name: "HandH1998/QQQ-Llama-3-8b-g128" tasks: diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml index 78347f63fa7..83e11f2be77 100644 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml @@ -1,3 +1,4 @@ +# For vllm script, with -t option (tensor parallel size). # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8" tasks: diff --git a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml index 4ef8b5c3709..15a836dddbd 100644 --- a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml +++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml @@ -1,3 +1,4 @@ +# For vllm script, with -t option (tensor parallel size). # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1 model_name: "mgoin/Minitron-4B-Base-FP8" tasks: diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml index 75a24e408e7..5633a2d9b82 100644 --- a/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml +++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml @@ -1,3 +1,4 @@ +# For vllm script, with -t option (tensor parallel size). 
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8 model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic" tasks: diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml index 436ec21924c..b8024c80e8e 100644 --- a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml +++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml @@ -1,3 +1,4 @@ +# For vllm script, with -t option (tensor parallel size). # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4 model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8" tasks: diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml index dec9164d1b8..188a112ca3a 100644 --- a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml +++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml @@ -1,4 +1,5 @@ -# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4 +# For hf script, without -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" tasks: - name: "gsm8k" diff --git a/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml index 166af81a3f0..099e0f465ba 100644 --- a/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml +++ b/.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml @@ -1,11 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1 model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16" tasks: - name: "gsm8k" metrics: - name: "exact_match,strict-match" - value: 0.31 + value: 0.30 - name: "exact_match,flexible-extract" - value: 0.47 + value: 0.465 limit: 1319 num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml index 42936fbfbe7..426e8ff6987 100644 --- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml +++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml @@ -1,3 +1,4 @@ +# For vllm script, with -t option (tensor parallel size). # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1 model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8" tasks: diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml index 43ff2bc5ce3..8d57e9dabd5 100644 --- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml +++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml @@ -1,3 +1,4 @@ +# For vllm script, with -t option (tensor parallel size). 
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8" tasks: diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml index 259799ba8bf..1bce7e7fdf1 100644 --- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml +++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml @@ -1,3 +1,4 @@ +# For vllm script, with -t option (tensor parallel size). # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1 model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise" tasks: diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml index 45d5efc8860..fc9707d0d6f 100644 --- a/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml +++ b/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml @@ -1,3 +1,4 @@ +# For vllm script, with -t option (tensor parallel size). # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4 model_name: "Qwen/Qwen2-57B-A14B-Instruct" tasks: diff --git a/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml index 2928d75ce44..9a9c749748e 100644 --- a/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml +++ b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml @@ -1,3 +1,4 @@ +# For vllm script, with -t option (tensor parallel size). # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2 model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM" tasks: diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index 4ae23eff62f..6015a83e829 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -16,7 +16,7 @@ import pytest import yaml -RTOL = 0.05 +RTOL = 0.08 TEST_DATA_FILE = os.environ.get( "LM_EVAL_TEST_DATA_FILE", ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml") diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 3354ea37002..4cc9c70a6ad 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,20 +1,20 @@ steps: - - label: "Build wheel - CUDA 12.4" + - label: "Build wheel - CUDA 12.8" agents: queue: cpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." 
- "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" - - label: "Build wheel - CUDA 12.1" + - label: "Build wheel - CUDA 12.6" agents: queue: cpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" @@ -48,7 +48,7 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - label: "Build and publish TPU release image" @@ -57,6 +57,8 @@ steps: agents: queue: tpu_queue_postmerge commands: + - "yes | docker system prune -a" + - "git fetch --all" - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ." - "docker push vllm/vllm-tpu:nightly" - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT" @@ -86,3 +88,18 @@ steps: - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" env: DOCKER_BUILDKIT: "1" + + - block: "Build Neuron release image" + key: block-neuron-release-image-build + depends_on: ~ + + - label: "Build and publish Neuron release image" + depends_on: block-neuron-release-image-build + agents: + queue: neuron-postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ." 
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)" + env: + DOCKER_BUILDKIT: "1" diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 469422ddec2..d29903bf497 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -75,30 +75,51 @@ HF_MOUNT="/root/.cache/huggingface" commands=$@ echo "Commands:$commands" #ignore certain kernels tests -if [[ $commands == *" kernels "* ]]; then +if [[ $commands == *" kernels/core"* ]]; then commands="${commands} \ - --ignore=kernels/test_attention_selector.py \ - --ignore=kernels/test_blocksparse_attention.py \ - --ignore=kernels/test_causal_conv1d.py \ - --ignore=kernels/test_cutlass.py \ - --ignore=kernels/test_encoder_decoder_attn.py \ - --ignore=kernels/test_flash_attn.py \ - --ignore=kernels/test_flashinfer.py \ - --ignore=kernels/test_int8_quant.py \ - --ignore=kernels/test_machete_gemm.py \ - --ignore=kernels/test_mamba_ssm.py \ - --ignore=kernels/test_marlin_gemm.py \ - --ignore=kernels/test_moe.py \ - --ignore=kernels/test_prefix_prefill.py \ - --ignore=kernels/test_rand.py \ - --ignore=kernels/test_sampler.py \ - --ignore=kernels/test_cascade_flash_attn.py \ - --ignore=kernels/test_mamba_mixer2.py \ - --ignore=kernels/test_aqlm.py \ - --ignore=kernels/test_machete_mm.py \ - --ignore=kernels/test_mha_attn.py \ - --ignore=kernels/test_block_fp8.py \ - --ignore=kernels/test_permute_cols.py" + --ignore=kernels/core/test_fused_quant_layernorm.py \ + --ignore=kernels/core/test_permute_cols.py" +fi + +if [[ $commands == *" kernels/attention"* ]]; then + commands="${commands} \ + --ignore=kernels/attention/stest_attention_selector.py \ + --ignore=kernels/attention/test_blocksparse_attention.py \ + --ignore=kernels/attention/test_encoder_decoder_attn.py \ + --ignore=kernels/attention/test_attention_selector.py \ + --ignore=kernels/attention/test_flash_attn.py \ + --ignore=kernels/attention/test_flashinfer.py \ + --ignore=kernels/attention/test_prefix_prefill.py \ + --ignore=kernels/attention/test_cascade_flash_attn.py \ + --ignore=kernels/attention/test_mha_attn.py \ + --ignore=kernels/attention/test_lightning_attn.py \ + --ignore=kernels/attention/test_attention.py" +fi + +if [[ $commands == *" kernels/quantization"* ]]; then + commands="${commands} \ + --ignore=kernels/quantization/test_int8_quant.py \ + --ignore=kernels/quantization/test_aqlm.py \ + --ignore=kernels/quantization/test_machete_mm.py \ + --ignore=kernels/quantization/test_block_fp8.py \ + --ignore=kernels/quantization/test_block_int8.py \ + --ignore=kernels/quantization/test_marlin_gemm.py \ + --ignore=kernels/quantization/test_cutlass_scaled_mm.py \ + --ignore=kernels/quantization/test_int8_kernel.py" +fi + +if [[ $commands == *" kernels/mamba"* ]]; then + commands="${commands} \ + --ignore=kernels/mamba/test_mamba_mixer2.py \ + --ignore=kernels/mamba/test_causal_conv1d.py \ + --ignore=kernels/mamba/test_mamba_ssm_ssd.py" +fi + +if [[ $commands == *" kernels/moe"* ]]; then + commands="${commands} \ + --ignore=kernels/moe/test_moe.py \ + --ignore=kernels/moe/test_cutlass_moe.py \ + --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py" fi #ignore certain Entrypoints/openai tests diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh index 9c5cf7cad94..5d863dd82e9 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh +++ 
b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh @@ -5,10 +5,41 @@ set -ex # Setup cleanup -remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; } +remove_docker_container() { + if [[ -n "$container_id" ]]; then + podman rm -f "$container_id" || true + fi + podman system prune -f +} trap remove_docker_container EXIT remove_docker_container # Try building the docker image -docker build -t cpu-test -f docker/Dockerfile.ppc64le . +podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le . + +# Run the image +container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc) + +function cpu_tests() { + + # offline inference + podman exec -it "$container_id" bash -c " + set -e + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + + # Run basic model test + podman exec -it "$container_id" bash -c " + set -e + pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib + pip install sentence-transformers datamodel_code_generator + pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach] + pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5] + pytest -v -s tests/models/encoder_decoder/language -m cpu_model" +} + +# All of CPU tests are expected to be finished less than 40 mins. + +export container_id +export -f cpu_tests +timeout 40m bash -c cpu_tests diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 87f74277cf9..21982b01b9c 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -17,10 +17,13 @@ source /etc/environment docker run --privileged --net host --shm-size=16G -it \ -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ - && python3 -m pip install pytest \ + && python3 -m pip install pytest pytest-asyncio tpu-info \ && python3 -m pip install lm_eval[api]==0.4.4 \ + && export VLLM_XLA_CACHE_PATH= \ && export VLLM_USE_V1=1 \ && export VLLM_XLA_CHECK_RECOMPILATION=1 \ + && echo HARDWARE \ + && tpu-info \ && echo TEST_0 \ && pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \ && echo TEST_1 \ @@ -40,7 +43,11 @@ docker run --privileged --net host --shm-size=16G -it \ && echo TEST_8 \ && pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \ && echo TEST_9 \ - && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" \ + && pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py \ + && echo TEST_10 \ + && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py \ + && echo TEST_11 \ + && pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" \ # TODO: This test fails because it uses RANDOM_SEED sampling diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index a681f892706..75e3ef26409 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" if [[ $normal_wheel == *"cu118"* ]]; then # if $normal_wheel matches cu118, do not upload the index.html echo "Skipping index files for cu118 wheels" -elif [[ $normal_wheel == *"cu121"* ]]; then - # if $normal_wheel matches 
cu121, do not upload the index.html - echo "Skipping index files for cu121 wheels" +elif [[ $normal_wheel == *"cu126"* ]]; then + # if $normal_wheel matches cu126, do not upload the index.html + echo "Skipping index files for cu126 wheels" else - # only upload index.html for cu124 wheels (default wheels) + # only upload index.html for cu128 wheels (default wheels) aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" fi @@ -66,12 +66,12 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" if [[ $normal_wheel == *"cu118"* ]]; then # if $normal_wheel matches cu118, do not upload the index.html echo "Skipping index files for cu118 wheels" -elif [[ $normal_wheel == *"cu121"* ]]; then - # if $normal_wheel matches cu121, do not upload the index.html - echo "Skipping index files for cu121 wheels" +elif [[ $normal_wheel == *"cu126"* ]]; then + # if $normal_wheel matches cu126, do not upload the index.html + echo "Skipping index files for cu126 wheels" else - # only upload index.html for cu124 wheels (default wheels) + # only upload index.html for cu128 wheels (default wheels) aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" fi -aws s3 cp "$wheel" "s3://vllm-wheels/$version/" \ No newline at end of file +aws s3 cp "$wheel" "s3://vllm-wheels/$version/" diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 38961138c97..b3005b1b4b0 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -8,6 +8,7 @@ # Documentation # label(str): the name of the test. emoji allowed. # fast_check(bool): whether to run this on each commit on fastcheck pipeline. +# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline. # fast_check_only(bool): run this test on fastcheck pipeline only # optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run. # command(str): the single command to run for tests. incompatible with commands. 
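The `torch_nightly(bool)` attribute documented above opts a step into the pipeline that tests vLLM built against PyTorch nightly (the same change adds docker/Dockerfile.nightly_torch and requirements/nightly_torch_test.txt for that pipeline). A minimal sketch of a step using the flag, mirroring the step layout used elsewhere in this file; the label, dependency path, and command below are illustrative placeholders, not a real step from this pipeline:

- label: Torch Nightly Smoke Test                 # placeholder step name
  torch_nightly: true                             # also run when vLLM is built against torch nightly
  source_file_dependencies:
  - vllm/                                         # placeholder path filter
  commands:
  - pytest -v -s compile/test_pass_manager.py     # placeholder command

Steps that omit the flag presumably keep running only in the existing pipelines; setting it to true only adds the extra nightly-torch run for that step.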
@@ -38,7 +39,7 @@ steps: - pip install -r ../../requirements/docs.txt - SPHINXOPTS=\"-W\" make html # Check API reference (if it fails, you may have missing mock imports) - - grep \"sig sig-object py\" build/html/api/inference_params.html + - grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html - label: Async Engine, Inputs, Utils, Worker Test # 24min source_file_dependencies: @@ -70,6 +71,7 @@ steps: - label: Basic Correctness Test # 30min #mirror_hardwares: [amd] fast_check: true + torch_nightly: true source_file_dependencies: - vllm/ - tests/basic_correctness/test_basic_correctness @@ -104,6 +106,7 @@ steps: - label: Entrypoints Test # 40min working_dir: "/vllm-workspace/tests" fast_check: true + torch_nightly: true #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -118,7 +121,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py - pytest -v -s entrypoints/test_chat_utils.py - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests @@ -205,6 +208,8 @@ steps: - pytest -v -s v1/sample - pytest -v -s v1/worker - pytest -v -s v1/structured_output + - pytest -v -s v1/spec_decode + - pytest -v -s v1/test_serial_utils.py - pytest -v -s v1/test_stats.py - pytest -v -s v1/test_utils.py - pytest -v -s v1/test_oracle.py @@ -288,14 +293,17 @@ steps: parallelism: 4 - label: PyTorch Compilation Unit Tests + torch_nightly: true source_file_dependencies: - vllm/ - tests/compile commands: - pytest -v -s compile/test_pass_manager.py - pytest -v -s compile/test_fusion.py + - pytest -v -s compile/test_sequence_parallelism.py - label: PyTorch Fullgraph Smoke Test # 9min + torch_nightly: true source_file_dependencies: - vllm/ - tests/compile @@ -306,21 +314,58 @@ steps: - pytest -v -s compile/piecewise/test_toy_llama.py - label: PyTorch Fullgraph Test # 18min + torch_nightly: true source_file_dependencies: - vllm/ - tests/compile commands: - pytest -v -s compile/test_full_graph.py -- label: Kernels Test %N # 1h each - # mirror_hardwares: [amd] +- label: Kernels Core Operation Test + mirror_hardwares: [amd] source_file_dependencies: - csrc/ + - tests/kernels/core + commands: + - pytest -v -s kernels/core + +- label: Kernels Attention Test %N + mirror_hardwares: [amd] + source_file_dependencies: + - csrc/attention/ - vllm/attention - - tests/kernels + - vllm/v1/attention + - tests/kernels/attention commands: - - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 4 + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Quantization Test %N + mirror_hardwares: [amd] + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + commands: + - pytest -v -s kernels/quantization 
--shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels MoE Test + #mirror_hardwares: [amd] + source_file_dependencies: + - csrc/moe/ + - tests/kernels/moe + - vllm/model_executor/layers/fused_moe/ + commands: + - pytest -v -s kernels/moe + +- label: Kernels Mamba Test + #mirror_hardwares: [amd] + source_file_dependencies: + - csrc/mamba/ + - tests/kernels/mamba + commands: + - pytest -v -s kernels/mamba - label: Tensorizer Test # 11min # mirror_hardwares: [amd] @@ -348,12 +393,13 @@ steps: commands: - pytest -v -s benchmarks/ -- label: Quantization Test # 33min +- label: Quantization Test source_file_dependencies: - csrc/ - vllm/model_executor/layers/quantization - tests/quantization - command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization + commands: + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization - label: LM Eval Small Models # 53min working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" @@ -393,83 +439,85 @@ steps: ##### models test ##### - label: Basic Models Test # 24min + torch_nightly: true source_file_dependencies: - vllm/ - tests/models commands: - pytest -v -s models/test_transformers.py - pytest -v -s models/test_registry.py + - pytest -v -s models/test_utils.py + - pytest -v -s models/test_vision.py # V1 Test: https://github.com/vllm-project/vllm/issues/14531 - - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4' + - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2' - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4' + - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2' -- label: Language Models Test (Standard) # 32min +- label: Language Models Test (Standard) #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - - tests/models/decoder_only/language - - tests/models/embedding/language - - tests/models/encoder_decoder/language + - tests/models/language commands: - - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' - - pytest -v -s models/embedding/language -m core_model + # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. + - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' + - pytest -v -s models/language -m core_model -- label: Language Models Test (Extended) # 1h10min +- label: Language Models Test (Extended) optional: true source_file_dependencies: - vllm/ - - tests/models/decoder_only/language - - tests/models/embedding/language - - tests/models/encoder_decoder/language + - tests/models/language commands: - - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' - - pytest -v -s models/embedding/language -m 'not core_model' + # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. 
+ - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' + - pytest -v -s models/language -m 'not core_model' -- label: Multi-Modal Models Test (Standard) # 40min +- label: Multi-Modal Models Test (Standard) #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - - tests/models/decoder_only/audio_language - - tests/models/decoder_only/vision_language - - tests/models/embedding/vision_language - - tests/models/encoder_decoder/audio_language - - tests/models/encoder_decoder/vision_language + - tests/models/multimodal commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal - - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' - - pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model' - - pytest -v -s models/embedding/vision_language -m core_model - - pytest -v -s models/encoder_decoder/audio_language -m core_model - - pytest -v -s models/encoder_decoder/language -m core_model - - pytest -v -s models/encoder_decoder/vision_language -m core_model - - pytest -v -s models/decoder_only/vision_language/test_interleaved.py - -- label: Multi-Modal Models Test (Extended) 1 # 48m + - pytest -v -s models/multimodal/processing + - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model + - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + +- label: Multi-Modal Models Test (Extended) 1 optional: true source_file_dependencies: - vllm/ - - tests/models/decoder_only/audio_language - - tests/models/decoder_only/vision_language - - tests/models/embedding/vision_language - - tests/models/encoder_decoder/vision_language + - tests/models/multimodal commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' - - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model' - - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model' - - pytest -v -s models/embedding/vision_language -m 'not core_model' - - pytest -v -s models/encoder_decoder/language -m 'not core_model' - - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model' - -- label: Multi-Modal Models Test (Extended) 2 # 38m + - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model' + +- label: Multi-Modal Models Test (Extended) 2 optional: true source_file_dependencies: - vllm/ - - tests/models/decoder_only/vision_language + - tests/models/multimodal commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model' + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + +- label: Multi-Modal Models Test (Extended) 3 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + +- label: Quantized Models Test + #mirror_hardwares: [amd] + source_file_dependencies: + - 
vllm/model_executor/layers/quantization + - tests/models/quantization + commands: + - pytest -v -s models/quantization # This test is used only in PR development phase to test individual models and should never run on main - label: Custom Models Test @@ -539,14 +587,16 @@ steps: - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' # Avoid importing model tests that cause CUDA reinitialization error - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' + # test sequence parallel + - pytest -v -s distributed/test_sequence_parallel.py # this test fails consistently. # TODO: investigate and fix # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - label: Plugin Tests (2 GPUs) # 40min working_dir: "/vllm-workspace/tests" diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 860c5c6cd53..76aa5f7a35d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -12,6 +12,7 @@ /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth /vllm/model_executor/guided_decoding @mgoin @russellb /vllm/multimodal @DarkLight1337 @ywang96 +/vllm/vllm_flash_attn @LucasWilkinson CMakeLists.txt @tlrmchlsmth # vLLM V1 diff --git a/.github/ISSUE_TEMPLATE/200-installation.yml b/.github/ISSUE_TEMPLATE/200-installation.yml index 590e56c1378..34da4019687 100644 --- a/.github/ISSUE_TEMPLATE/200-installation.yml +++ b/.github/ISSUE_TEMPLATE/200-installation.yml @@ -14,7 +14,7 @@ body: description: | Please run the following and paste the output below. ```sh - wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py # For security purposes, please feel free to check the contents of collect_env.py before running it. python collect_env.py ``` diff --git a/.github/ISSUE_TEMPLATE/300-usage.yml b/.github/ISSUE_TEMPLATE/300-usage.yml index 004798a388a..c9e4be0e771 100644 --- a/.github/ISSUE_TEMPLATE/300-usage.yml +++ b/.github/ISSUE_TEMPLATE/300-usage.yml @@ -14,7 +14,7 @@ body: description: | Please run the following and paste the output below. ```sh - wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py # For security purposes, please feel free to check the contents of collect_env.py before running it. python collect_env.py ``` diff --git a/.github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml index d4113da8b5b..637d2dd1145 100644 --- a/.github/ISSUE_TEMPLATE/400-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml @@ -14,19 +14,19 @@ body: description: | Please run the following and paste the output below. 
```sh - wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py # For security purposes, please feel free to check the contents of collect_env.py before running it. python collect_env.py ``` It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. value: |
- The output of `python collect_env.py` + The output of python collect_env.py ```text Your output of `python collect_env.py` here ``` - +
validations: required: true diff --git a/.github/ISSUE_TEMPLATE/700-performance-discussion.yml b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml index 273f50d59cf..3d31c115501 100644 --- a/.github/ISSUE_TEMPLATE/700-performance-discussion.yml +++ b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml @@ -35,7 +35,7 @@ body: description: | Please run the following and paste the output below. ```sh - wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py # For security purposes, please feel free to check the contents of collect_env.py before running it. python collect_env.py ``` diff --git a/.github/mergify.yml b/.github/mergify.yml index 3097b994659..15fa3660a87 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -55,11 +55,19 @@ pull_request_rules: description: Automatically apply structured-output label conditions: - or: + - files~=^benchmarks/structured_schemas/ + - files=benchmarks/benchmark_serving_structured_output.py + - files=benchmarks/run_structured_output_benchmark.sh + - files=docs/source/features/structured_outputs.md + - files=examples/offline_inference/structured_outputs.py + - files=examples/online_serving/openai_chat_completion_structured_outputs.py + - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py - files~=^vllm/model_executor/guided_decoding/ - files=tests/model_executor/test_guided_processors.py - files=tests/entrypoints/llm/test_guided_generate.py - - files=benchmarks/benchmark_serving_guided.py - - files=benchmarks/benchmark_guided.py + - files~=^tests/v1/structured_output/ + - files=tests/v1/entrypoints/llm/test_guided_generate.py + - files~=^vllm/v1/structured_output/ actions: label: add: @@ -118,6 +126,28 @@ pull_request_rules: remove: - tpu +- name: label-tool-calling + description: Automatically add tool-calling label + conditions: + - or: + - files~=^tests/tool_use/ + - files~=^tests/mistral_tool_use/ + - files~=^tests/entrypoints/openai/tool_parsers/ + - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py + - files~=^vllm/entrypoints/openai/tool_parsers/ + - files=docs/source/features/tool_calling.md + - files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md + - files=docs/source/getting_started/examples/chat_with_tools.md + - files~=^examples/tool_chat_* + - files=examples/offline_inference/chat_with_tools.py + - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py + - files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py + - files=examples/online_serving/openai_chat_completion_client_with_tools.py + actions: + label: + add: + - tool-calling + - name: ping author on conflicts and add 'needs-rebase' label conditions: - conflict diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 7b1d9f69938..dd9b61a6473 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -66,7 +66,7 @@ jobs: export AWS_SECRET_ACCESS_KEY=minioadmin sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set 
secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - + - name: curl test run: | kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 & @@ -79,4 +79,4 @@ jobs: "max_tokens": 7, "temperature": 0 }'):$CODE" - echo "$CODE" \ No newline at end of file + echo "$CODE" diff --git a/.gitignore b/.gitignore index 6f5cbd0733d..2756c612b82 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,6 @@ # vllm-flash-attn built from source vllm/vllm_flash_attn/* -!vllm/vllm_flash_attn/fa_utils.py # Byte-compiled / optimized / DLL files __pycache__/ @@ -81,6 +80,7 @@ instance/ # Sphinx documentation docs/_build/ docs/source/getting_started/examples/ +docs/source/api/vllm # PyBuilder .pybuilder/ @@ -203,3 +203,6 @@ benchmarks/**/*.json # Linting actionlint shellcheck*/ + +# Ingore moe/marlin_moe gen code +csrc/moe/marlin_moe_wna16/kernel_* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e921f69925b..5ecd7b70ea5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,31 +11,30 @@ repos: hooks: - id: yapf args: [--in-place, --verbose] - additional_dependencies: [toml] # TODO: Remove when yapf is upgraded - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.3 + rev: v0.11.7 hooks: - id: ruff args: [--output-format, github, --fix] - repo: https://github.com/codespell-project/codespell - rev: v2.4.0 + rev: v2.4.1 hooks: - id: codespell additional_dependencies: ['tomli'] args: ['--toml', 'pyproject.toml'] - repo: https://github.com/PyCQA/isort - rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0 + rev: 6.0.1 hooks: - id: isort - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v19.1.7 + rev: v20.1.3 hooks: - id: clang-format exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' types_or: [c++, cuda] args: [--style=file, --verbose] - repo: https://github.com/jackdewinter/pymarkdown - rev: v0.9.27 + rev: v0.9.29 hooks: - id: pymarkdown args: [fix] @@ -44,10 +43,10 @@ repos: hooks: - id: actionlint - repo: https://github.com/astral-sh/uv-pre-commit - rev: 0.6.2 + rev: 0.6.17 hooks: - id: pip-compile - args: [requirements/test.in, -o, requirements/test.txt] + args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128] files: ^requirements/test\.(in|txt)$ - repo: local hooks: diff --git a/CMakeLists.txt b/CMakeLists.txt index 4f4b20d3515..8012c233462 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,6 @@ project(vllm_extensions LANGUAGES CXX) # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py) set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM") - message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") message(STATUS "Target device: ${VLLM_TARGET_DEVICE}") @@ -46,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1 # requirements.txt files and should be kept consistent. 
The ROCm torch # versions are derived from docker/Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0") -set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0") +set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0") # # Try to find python package with an executable that exactly matches @@ -241,6 +240,7 @@ set(VLLM_EXT_SRC "csrc/quantization/fp8/common.cu" "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu" "csrc/quantization/gguf/gguf_kernel.cu" + "csrc/quantization/activation_kernels.cu" "csrc/cuda_utils_kernels.cu" "csrc/prepare_inputs/advance_step.cu" "csrc/custom_all_reduce.cu" @@ -249,9 +249,8 @@ set(VLLM_EXT_SRC if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") - # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case. - # Please keep this in sync with FetchContent_Declare line below. - set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use") + # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. + set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use") # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) @@ -269,7 +268,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cutlass GIT_REPOSITORY https://github.com/nvidia/cutlass.git # Please keep this in sync with CUTLASS_REVISION line above. - GIT_TAG v3.8.0 + GIT_TAG ${CUTLASS_REVISION} GIT_PROGRESS TRUE # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history. @@ -290,7 +289,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/quantization/fp4/nvfp4_quant_entry.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu" "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" - "csrc/cutlass_extensions/common.cpp") + "csrc/cutlass_extensions/common.cpp" + "csrc/attention/mla/cutlass_mla_entry.cu") set_gencode_flags_for_srcs( SRCS "${VLLM_EXT_SRC}" @@ -463,7 +463,26 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") set(FP4_ARCHS) endif() - # + # CUTLASS MLA Archs and flags + cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS) + set(SRCS + "csrc/attention/mla/cutlass_mla_kernels.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${MLA_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1") + # Add MLA-specific include directories only to MLA source files + set_source_files_properties(${SRCS} + PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common") + message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}") + else() + message(STATUS "Not building CUTLASS MLA as no compatible archs were found.") + # clear MLA_ARCHS + set(MLA_ARCHS) + endif() + # CUTLASS MoE kernels # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works @@ -661,6 +680,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() endif() +if(VLLM_GPU_LANG STREQUAL "CUDA") + set(MOE_PERMUTE_SRC + "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu" + "csrc/moe/moe_permute_unpermute_op.cu") + + set_gencode_flags_for_srcs( + SRCS "${MARLIN_PERMUTE_SRC}" + CUDA_ARCHS "${MOE_PERMUTE_ARCHS}") + + list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}") +endif() message(STATUS "Enabling moe extension.") define_gpu_extension_target( _moe_C @@ -669,6 +699,8 @@ define_gpu_extension_target( 
SOURCES ${VLLM_MOE_EXT_SRC} COMPILE_FLAGS ${VLLM_GPU_FLAGS} ARCHITECTURES ${VLLM_GPU_ARCHES} + INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} + INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} USE_SABI 3 WITH_SOABI) @@ -678,6 +710,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP") # set(VLLM_ROCM_EXT_SRC "csrc/rocm/torch_bindings.cpp" + "csrc/rocm/skinny_gemms.cu" "csrc/rocm/attention.cu") define_gpu_extension_target( diff --git a/benchmarks/auto_tune.sh b/benchmarks/auto_tune.sh new file mode 100644 index 00000000000..ea63c6f71a6 --- /dev/null +++ b/benchmarks/auto_tune.sh @@ -0,0 +1,212 @@ +#!/bin/bash + +# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. +# The current server parameter combination is max_num_seqs and max_num_batched_tokens +# It also supports additional requirement: e2e latency and prefix cache. + +# Pre-requisite: +# 1. Checkout to your branch, install/ update the correct running env. For TPU, activate conda env and install the corresponding torch, xla version. +# 2. If the model is customized, replace the MODEL's config with the customized config. +# 3. Set variables (ALL REQUIRED) +# BASE: your directory for vllm repo +# MODEL: the model served by vllm +# DOWNLOAD_DIR: directory to download and load model weights. +# INPUT_LEN: request input len +# OUTPUT_LEN: request output len +# MIN_CACHE_HIT_PCT: prefix cache rate +# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000 +# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens. +# 5. The final result will be saved in RESULT file. + + +# Example use cases +# 1. Given input_len=1800, output_len=20, what's the best max_num_seqs and max_num_batched_tokens to get highest throughput? +# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000 +# 2. If we have latency requirement to be lower than 500ms, what's the best server parameter? +# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500 +# 3. If we want to reach 60% prefix cache, what's the best server parameter? 
+# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500 + +TAG=$(date +"%Y_%m_%d_%H_%M") +BASE="" +MODEL="meta-llama/Llama-3.1-8B-Instruct" +DOWNLOAD_DIR="" +INPUT_LEN=4000 +OUTPUT_LEN=16 +MIN_CACHE_HIT_PCT_PCT=0 +MAX_LATENCY_ALLOWED_MS=100000000000 + +LOG_FOLDER="$BASE/auto-benchmark/$TAG" +RESULT="$LOG_FOLDER/result.txt" + +echo "result file$ $RESULT" +echo "model: $MODEL" +echo + +rm -rf $LOG_FOLDER +mkdir -p $LOG_FOLDER + +cd "$BASE/vllm" +# create sonnet-4x.txt so that we can sample 2048 tokens for input +echo "" > benchmarks/sonnet_4x.txt +for _ in {1..4} +do +cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt +done + +pip install datasets + +current_hash=$(git rev-parse HEAD) +echo "hash:$current_hash" >> "$RESULT" +echo "current_hash: $current_hash" + +best_throughput=0 +best_max_num_seqs=0 +best_num_batched_tokens=0 +best_goodput=0 +run_benchmark() { + local max_num_seqs=$1 + local max_num_batched_tokens=$2 + echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" + local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" + echo "vllm_log: $vllm_log" + echo + rm -f $vllm_log + + # start the server + VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \ + --disable-log-requests \ + --port 8004 \ + --gpu-memory-utilization 0.98 \ + --max-num-seqs $max_num_seqs \ + --max-num-batched-tokens $max_num_batched_tokens \ + --tensor-parallel-size 1 \ + --enable-prefix-caching \ + --load-format dummy \ + --download-dir $DOWNLOAD_DIR \ + --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 & + echo "wait for 10 minutes.." + echo + # wait for 10 minutes... + server_started=0 + for i in {1..60}; do + if grep -Fq "Application startup complete" "$vllm_log"; then + echo "Application started" + server_started=1 + break + else + # echo "wait for 10 seconds..." + sleep 10 + fi + done + + if (( ! server_started )); then + echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log" + echo "pkill -f vllm" + echo + pkill vllm + sleep 10 + return 1 + fi + + echo "run benchmark test..." + echo + meet_latency_requirement=0 + # get a basic qps by using request-rate inf + bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" + prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) + python benchmarks/benchmark_serving.py \ + --backend vllm \ + --model $MODEL \ + --dataset-name sonnet \ + --dataset-path benchmarks/sonnet_4x.txt \ + --sonnet-input-len $INPUT_LEN \ + --sonnet-output-len $OUTPUT_LEN \ + --ignore-eos \ + --disable-tqdm \ + --request-rate inf \ + --percentile-metrics ttft,tpot,itl,e2el \ + --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ + --num-prompts 100 \ + --sonnet-prefix-len $prefix_len \ + --port 8004 > "$bm_log" + through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') + e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') + goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') + + if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then + meet_latency_requirement=1 + fi + + if (( ! 
meet_latency_requirement )); then + # start from request-rate as int(through_put) + 1 + request_rate=$((${through_put%.*} + 1)) + while ((request_rate > 0)); do + # clear prefix cache + curl -X POST http://0.0.0.0:8004/reset_prefix_cache + sleep 5 + bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt" + python benchmarks/benchmark_serving.py \ + --backend vllm \ + --model $MODEL \ + --dataset-name sonnet \ + --dataset-path benchmarks/sonnet_4x.txt \ + --sonnet-input-len $INPUT_LEN \ + --sonnet-output-len $OUTPUT_LEN \ + --ignore_eos \ + --disable-tqdm \ + --request-rate $request_rate \ + --percentile-metrics ttft,tpot,itl,e2el \ + --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ + --num-prompts 100 \ + --sonnet-prefix-len $prefix_len \ + --port 8004 > "$bm_log" + through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') + e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') + goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') + if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then + meet_latency_requirement=1 + break + fi + request_rate=$((request_rate-1)) + done + fi + # write the results and update the best result. + if ((meet_latency_requirement)); then + echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" + echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT" + if (( $(echo "$through_put > $best_throughput" | bc -l) )); then + best_throughput=$through_put + best_max_num_seqs=$max_num_seqs + best_num_batched_tokens=$max_num_batched_tokens + best_goodput=$goodput + fi + else + echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" + echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT" + fi + + echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" + + echo "pkill -f vllm" + echo + pkill vllm + sleep 10 + rm -f $vllm_log + printf '=%.0s' $(seq 1 20) + return 0 +} + + +num_seqs_list="128 256" +num_batched_tokens_list="512 1024 2048 4096" +for num_seqs in $num_seqs_list; do + for num_batched_tokens in $num_batched_tokens_list; do + run_benchmark $num_seqs $num_batched_tokens + exit 0 + done +done +echo "finish permutations" +echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" +echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT" + diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 287d500a81d..e6a67fda682 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 +import io import json import os import sys @@ -32,6 +33,7 @@ class RequestFuncInput: extra_body: Optional[dict] = None multi_modal_content: Optional[dict] = None ignore_eos: bool = False + language: Optional[str] = None @dataclass @@ -199,6 +201,7 @@ async def async_request_deepspeed_mii( timeout=AIOHTTP_TIMEOUT) as session: 
payload = { + "model": request_func_input.model, "prompt": request_func_input.prompt, "max_tokens": request_func_input.output_len, "temperature": 0.01, # deepspeed-mii does not accept 0.0 temp. @@ -258,6 +261,7 @@ async def async_request_openai_completions( if request_func_input.model_name else request_func_input.model, "prompt": request_func_input.prompt, "temperature": 0.0, + "repetition_penalty": 1.0, "max_tokens": request_func_input.output_len, "logprobs": request_func_input.logprobs, "stream": True, @@ -436,6 +440,110 @@ async def async_request_openai_chat_completions( return output +async def async_request_openai_audio( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + # Lazy import without PlaceholderModule to avoid vllm dep. + import soundfile + api_url = request_func_input.api_url + assert api_url.endswith( + ("transcriptions", "translations" + )), "OpenAI Chat Completions API URL must end with 'transcriptions' " + "or `translations`." + + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: + content = [{"type": "text", "text": request_func_input.prompt}] + payload = { + "model": request_func_input.model_name \ + if request_func_input.model_name else request_func_input.model, + "temperature": 0.0, + "max_completion_tokens": request_func_input.output_len, + "stream": True, + "language": "en", + # Flattened due to multipart/form-data + "stream_include_usage": True, + "stream_continuous_usage_stats": True + } + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + + # Send audio file + def to_bytes(y, sr): + buffer = io.BytesIO() + soundfile.write(buffer, y, sr, format="WAV") + buffer.seek(0) + return buffer + + with to_bytes(*request_func_input.multi_modal_content['audio']) as f: + form = aiohttp.FormData() + form.add_field('file', f, content_type='audio/wav') + for key, value in payload.items(): + form.add_field(key, str(value)) + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, + data=form, + headers=headers) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") + if chunk != "[DONE]": + timestamp = time.perf_counter() + data = json.loads(chunk) + + if choices := data.get("choices"): + content = choices[0]["delta"].get( + "content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append( + timestamp - most_recent_timestamp) + + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") + + most_recent_timestamp = timestamp + + output.generated_text = generated_text + output.success = True + output.latency = most_recent_timestamp - st + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + def get_model(pretrained_model_name_or_path: str) -> str: if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': from 
modelscope import snapshot_download @@ -493,6 +601,7 @@ def get_tokenizer( "deepspeed-mii": async_request_deepspeed_mii, "openai": async_request_openai_completions, "openai-chat": async_request_openai_chat_completions, + "openai-audio": async_request_openai_audio, "tensorrt-llm": async_request_trt_llm, "scalellm": async_request_openai_completions, "sglang": async_request_openai_completions, diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 63f174275d4..9c614baf1f0 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -64,6 +64,7 @@ class SampleRequest: class BenchmarkDataset(ABC): DEFAULT_SEED = 0 + IS_MULTIMODAL = False def __init__( self, @@ -621,6 +622,7 @@ class ConversationDataset(HuggingFaceDataset): SUPPORTED_DATASET_PATHS = { 'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered' } + IS_MULTIMODAL = True def sample(self, tokenizer: PreTrainedTokenizerBase, @@ -685,6 +687,7 @@ class VisionArenaDataset(HuggingFaceDataset): "lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"] } + IS_MULTIMODAL = True def sample( self, @@ -768,6 +771,60 @@ def sample(self, return sampled_requests +# ----------------------------------------------------------------------------- +# MT-Bench Dataset Implementation +# ----------------------------------------------------------------------------- + + +class MTBenchDataset(HuggingFaceDataset): + """ + MT-Bench Dataset. + https://huggingface.co/datasets/philschmid/mt-bench + + We create a single turn dataset for MT-Bench. + This is similar to Spec decoding benchmark setup in vLLM + https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18 + """ # noqa: E501 + + DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM + SUPPORTED_DATASET_PATHS = { + "philschmid/mt-bench", + } + + def sample(self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs) -> list: + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + sampled_requests = [] + + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = item['turns'][0] + + # apply template + prompt = tokenizer.apply_chat_template([{ + "role": "user", + "content": prompt + }], + add_generation_prompt=True, + tokenize=False) + + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + )) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests + + # ----------------------------------------------------------------------------- # AIMO Dataset Implementation # ----------------------------------------------------------------------------- @@ -815,3 +872,80 @@ def sample(self, )) self.maybe_oversample_requests(sampled_requests, num_requests) return sampled_requests + + +# ----------------------------------------------------------------------------- +# ASR Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ASRDataset(HuggingFaceDataset): + """ + Dataset class for processing a ASR dataset for transcription. 
+ Tested on the following set: + + +----------------+----------------------------------------+--------------------------+-----------------------------+ + | Dataset | Domain | Speaking Style | hf-subset | + +----------------+----------------------------------------+--------------------------+-----------------------------+ + | TED-LIUM | TED talks | Oratory | release1, release2, release3| + | | | | release3-speaker-adaptation | + | VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... | + | LibriSpeech | Audiobook | Narrated | "LIUM/tedlium" | + | GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test | + | SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test | + | AMI | Meetings | Spontaneous | ihm, sdm | + +----------------+----------------------------------------+--------------------------+-----------------------------+ + + """ # noqa: E501 + SUPPORTED_DATASET_PATHS = { + "openslr/librispeech_asr", "facebook/voxpopuli", "LIUM/tedlium", + "edinburghcstr/ami", "speechcolab/gigaspeech", "kensho/spgispeech" + } + + DEFAULT_OUTPUT_LEN = 128 + IS_MULTIMODAL = True + + # TODO Whisper-specific. Abstract interface when more models are supported. + TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|>"\ + "<|notimestamps|>" + skip_long_audios: bool = True + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + **kwargs, + ) -> list: + import librosa + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + prompt = ASRDataset.TRANSCRIPTION_PREAMBLE + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests = [] + skipped = 0 + for item in self.data: + if len(sampled_requests) >= num_requests: + break + audio = item["audio"] + y, sr = audio["array"], audio["sampling_rate"] + duration_s = librosa.get_duration(y=y, sr=sr) + # Whisper max supported duration + if self.skip_long_audios and duration_s > 30: + skipped += 1 + continue + + mm_content = {"audio": (y, sr)} + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + )) + if skipped: + logger.warning("%d samples discarded from dataset due to" \ + " their length being greater than" \ + " what Whisper supports.", skipped) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 4fff7a8fc8e..f44da95d321 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -63,14 +63,16 @@ class Request: output_len: int -def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> str: +def sample_tokens(tokenizer: PreTrainedTokenizerBase, + length: int) -> list[int]: vocab = tokenizer.get_vocab() + all_special_ids = set(tokenizer.all_special_ids) + # Remove the special tokens. 
- vocab = { - k: v - for k, v in vocab.items() if k not in tokenizer.all_special_ids - } - return random.choices(list(vocab.values()), k=length) + return random.choices( + [v for k, v in vocab.items() if k not in all_special_ids], + k=length, + ) def sample_requests_from_dataset( diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index b5bd840d841..c236d64261d 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -50,11 +50,11 @@ except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser -from benchmark_dataset import (AIMODataset, BurstGPTDataset, +from benchmark_dataset import (AIMODataset, ASRDataset, BurstGPTDataset, ConversationDataset, HuggingFaceDataset, - InstructCoderDataset, RandomDataset, - SampleRequest, ShareGPTDataset, SonnetDataset, - VisionArenaDataset) + InstructCoderDataset, MTBenchDataset, + RandomDataset, SampleRequest, ShareGPTDataset, + SonnetDataset, VisionArenaDataset) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -274,10 +274,6 @@ async def benchmark( input_requests[0].expected_output_len, \ input_requests[0].multi_modal_data - if backend != "openai-chat" and test_mm_content is not None: - # multi-modal benchmark is only available on OpenAI Chat backend. - raise ValueError( - "Multi-modal content is only supported on 'openai-chat' backend.") assert test_mm_content is None or isinstance(test_mm_content, dict) test_input = RequestFuncInput( model=model_id, @@ -599,11 +595,17 @@ def main(args: argparse.Namespace): elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: dataset_class = InstructCoderDataset args.hf_split = "train" + elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS: + dataset_class = MTBenchDataset + args.hf_split = "train" elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: dataset_class = ConversationDataset elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: dataset_class = AIMODataset args.hf_split = "train" + elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS: + dataset_class = ASRDataset + args.hf_split = "train" else: supported_datasets = set([ dataset_name for cls in HuggingFaceDataset.__subclasses__() @@ -615,6 +617,13 @@ def main(args: argparse.Namespace): f" from one of following: {supported_datasets}. " "Please consider contributing if you would " "like to add support for additional dataset formats.") + + if (dataset_class.IS_MULTIMODAL and backend not in \ + ["openai-chat", "openai-audio"]): + # multi-modal benchmark is only available on OpenAI Chat backend. + raise ValueError( + "Multi-modal content is only supported on 'openai-chat' and " \ + "'openai-audio' backend.") input_requests = dataset_class( dataset_path=args.dataset_path, dataset_subset=args.hf_subset, @@ -707,7 +716,7 @@ def main(args: argparse.Namespace): )) # Save config and results to json - if args.save_result: + if args.save_result or args.append_result: result_json: dict[str, Any] = {} # Setup @@ -728,6 +737,14 @@ def main(args: argparse.Namespace): raise ValueError( "Invalid metadata format. Please use KEY=VALUE format." 
) + # Traffic + result_json["request_rate"] = (args.request_rate if args.request_rate + < float("inf") else "inf") + result_json["burstiness"] = args.burstiness + result_json["max_concurrency"] = args.max_concurrency + + # Merge with benchmark result + result_json = {**result_json, **benchmark_result} if not args.save_detailed: # Remove fields with too many data points @@ -738,15 +755,6 @@ def main(args: argparse.Namespace): if field in result_json: del result_json[field] - # Traffic - result_json["request_rate"] = (args.request_rate if args.request_rate - < float("inf") else "inf") - result_json["burstiness"] = args.burstiness - result_json["max_concurrency"] = args.max_concurrency - - # Merge with benchmark result - result_json = {**result_json, **benchmark_result} - # Save to file base_model_id = model_id.split("/")[-1] max_concurrency_str = (f"-concurrency{args.max_concurrency}" @@ -756,7 +764,12 @@ def main(args: argparse.Namespace): file_name = args.result_filename if args.result_dir: file_name = os.path.join(args.result_dir, file_name) - with open(file_name, "w", encoding='utf-8') as outfile: + with open(file_name, + mode="a+" if args.append_result else "w", + encoding='utf-8') as outfile: + # Append a newline. + if args.append_result and outfile.tell() != 0: + outfile.write("\n") json.dump(result_json, outfile) save_to_pytorch_benchmark_format(args, result_json, file_name) @@ -888,6 +901,11 @@ def main(args: argparse.Namespace): help="When saving the results, whether to include per request " "information such as response, error, ttfs, tpots, etc.", ) + parser.add_argument( + "--append-result", + action="store_true", + help="Append the benchmark result to the existing json file.", + ) parser.add_argument( "--metadata", metavar="KEY=VALUE", diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index e52f16a8b12..7c40e39ac81 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -51,7 +51,7 @@ except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser -from vllm.v1.structured_output.utils import ( +from vllm.v1.structured_output.backend_xgrammar import ( has_xgrammar_unsupported_json_features) MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -123,6 +123,8 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, copy.deepcopy(schema) for _ in range(args.num_prompts) ] for i in range(len(json_schemas)): + if "properties" not in json_schemas[i]: + json_schemas[i]["properties"] = {} json_schemas[i]["properties"][ f"__optional_field_{uuid.uuid4()}"] = { "type": @@ -134,7 +136,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, json_schemas = [schema] * args.num_prompts def gen_prompt(index: int): - return f"Generate an example of a user profile given the following schema: {json.dumps(get_schema(index))}" # noqa: E501 + return f"Generate an example of a brief user profile given the following schema: {json.dumps(get_schema(index))}" # noqa: E501 def get_schema(index: int): return json_schemas[index % len(json_schemas)] @@ -150,17 +152,17 @@ def get_schema(index: int): elif args.dataset == "grammar": schema = """ - ?start: select_statement + root ::= select_statement - ?select_statement: "SELECT " column_list " FROM " table_name + select_statement ::= "SELECT " column " from " table " where " condition - ?column_list: column_name ("," column_name)* + column ::= "col_1 " | "col_2 " - ?table_name: identifier + table ::= "table_1 " | 
"table_2 " - ?column_name: identifier + condition ::= column "= " number - ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ + number ::= "1 " | "2 " """ prompt = "Generate an SQL query to show the 'username' \ and 'email' from the 'users' table." @@ -231,7 +233,8 @@ def _filter_func(item): idx -= len_dataset schema = dataset["schema"][idx] prompt = tokenizer.apply_chat_template(dataset["prompt"][idx], - tokenize=False) + tokenize=False, + add_generation_prompt=True) input_len = len(tokenizer(prompt).input_ids) completion = dataset["completion"][idx] @@ -849,7 +852,7 @@ def main(args: argparse.Namespace): 'json', 'json-unique', 'grammar', 'regex', 'choice', 'xgrammar_bench' ]) - parser.add_argument("--json_schema_path", + parser.add_argument("--json-schema-path", type=str, default=None, help="Path to json schema.") diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 67e509c1f55..1f65277e1bf 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -523,6 +523,13 @@ def validate_args(args): raise ValueError( "Tokenizer must be the same as the model for MII backend.") + # --data-parallel is not supported currently. + # https://github.com/vllm-project/vllm/issues/16222 + if args.data_parallel_size > 1: + raise ValueError( + "Data parallel is not supported in offline benchmark, \ + please use benchmark serving instead") + if __name__ == "__main__": parser = FlexibleArgumentParser(description="Benchmark the throughput.") diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py new file mode 100644 index 00000000000..b23b4f3ea68 --- /dev/null +++ b/benchmarks/kernels/benchmark_bitblas.py @@ -0,0 +1,236 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from vllm.model_executor.layers.quantization.utils.bitblas_utils import ( + MINIMUM_BITBLAS_VERSION) + +try: + import bitblas + if bitblas.__version__ < MINIMUM_BITBLAS_VERSION: + raise ImportError("bitblas version is wrong. Please " + f"install bitblas>={MINIMUM_BITBLAS_VERSION}") +except ImportError as e: + bitblas_import_exception = e + raise ValueError("Trying to use the bitblas backend, but could not import" + f"with the following error: {bitblas_import_exception}. 
" + "Please install bitblas through the following command: " + f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`" + ) from bitblas_import_exception + +from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target + +from vllm.utils import FlexibleArgumentParser + +parser = FlexibleArgumentParser( + description="Benchmark BitBLAS int4 on a specific target.") + +# Add arguments to the parser +parser.add_argument( + "--target", + type=str, + default=auto_detect_nvidia_target(), + help="Specify the target device for benchmarking.", +) +parser.add_argument("--group_size", + type=int, + default=None, + help="Group size for grouped quantization.") +parser.add_argument( + "--A_dtype", + type=str, + default="float16", + choices=["float16", "float32", "float64", "int32", "int8"], + help="Data type of activation A.", +) +parser.add_argument( + "--W_dtype", + type=str, + default="int4", + choices=[ + "float16", + "float32", + "float64", + "int32", + "int8", + "int4", + "int2", + "int1", + "nf4", + "fp4_e2m1", + ], + help="Data type of weight W.", +) +parser.add_argument( + "--accum_dtype", + type=str, + default="float16", + choices=["float16", "int32"], + help="Data type for accumulation.", +) +parser.add_argument( + "--out_dtype", + type=str, + default="float16", + choices=["float16", "float32", "int32", "int8"], + help="Data type for output.", +) +parser.add_argument( + "--layout", + type=str, + default="nt", + choices=["nt", "nn"], + help="Matrix layout, 'nt' for non-transpose A and transpose W.", +) +parser.add_argument("--with_bias", + action="store_true", + help="Include bias in the benchmark.") +parser.add_argument( + "--with_scaling", + action="store_true", + help="Include scaling factor in the quantization.", +) +parser.add_argument("--with_zeros", + action="store_true", + help="Include zeros in the quantization.") +parser.add_argument( + "--zeros_mode", + type=str, + default=None, + choices=["original", "rescale", "quantized"], + help="Specify the mode for calculating zeros.", +) + +# Parse the arguments +args = parser.parse_args() + +# Assign arguments to variables +target = args.target +A_dtype = args.A_dtype +W_dtype = args.W_dtype +accum_dtype = args.accum_dtype +out_dtype = args.out_dtype +layout = args.layout +with_bias = args.with_bias +group_size = args.group_size +with_scaling = args.with_scaling +with_zeros = args.with_zeros +zeros_mode = args.zeros_mode + +# Define a list of shared arguments that repeat in every config +shared_args = [ + A_dtype, + W_dtype, + out_dtype, + accum_dtype, + layout, + with_bias, + group_size, + with_scaling, + with_zeros, + zeros_mode, +] + +# Define just the (M, K, N) shapes in a more compact list +shapes = [ + # square test + (1, 16384, 16384), + # BLOOM-176B + (1, 43008, 14336), + (1, 14336, 14336), + (1, 57344, 14336), + (1, 14336, 57344), + # OPT-65B + (1, 9216, 9216), + (1, 36864, 9216), + (1, 9216, 36864), + (1, 22016, 8192), + # LLAMA-70B/65B + (1, 8192, 22016), + (1, 8192, 8192), + (1, 28672, 8192), + (1, 8192, 28672), + # square test + (16384, 16384, 16384), + # BLOOM-176B + (8192, 43008, 14336), + (8192, 14336, 14336), + (8192, 57344, 14336), + (8192, 14336, 57344), + # OPT-65B + (8192, 9216, 9216), + (8192, 36864, 9216), + (8192, 9216, 36864), + (8192, 22016, 8192), + # LLAMA-70B/65B + (8192, 8192, 22016), + (8192, 8192, 8192), + (8192, 28672, 8192), + (8192, 8192, 28672), +] + +# Build test shapes with all the shared arguments +test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args)) + for shape in shapes] + +benchmark_sets = 
[] +benchmark_sets.extend(test_shapes) + +benchmark_results = {} +for config_class, operator, input_args in benchmark_sets: + config = config_class(*input_args) + matmul = operator(config, target=target, enable_tuning=True) + kernel_latency = matmul.profile_latency() + + print("Time cost is: {:.3f} ms".format(kernel_latency)) + + profile_config = { + f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": { + "BitBLAS_top20_latency": kernel_latency, + } + } + + benchmark_results.update(profile_config) + +# Define headers for the table +headers = [ + "PrimFunc", + "Input Arguments", + "BitBLAS Top20 Latency", +] + +# Calculate column widths for pretty printing +col_widths = [0, 0, 0] +for config_key, values in benchmark_results.items(): + args_split = config_key.split("-") + func_name = args_split[0] + input_args_str = "-".join(args_split[1:]) + col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2) + col_widths[1] = max(col_widths[1], + len(input_args_str) + 2, + len(headers[1]) + 2) + col_widths[2] = max(col_widths[2], + len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2, + len(headers[2]) + 2) + # break only if you want to measure widths from a single example; + # otherwise, let it loop over all items. + +# Print header +for i, header in enumerate(headers): + headers[i] = header.ljust(col_widths[i]) +print("".join(headers)) +print("-" * sum(col_widths)) + +# Print rows +for config_key, values in benchmark_results.items(): + args_split = config_key.split("-") + func_name = args_split[0] + input_args_str = "-".join(args_split[1:]) + row = [ + func_name, + input_args_str, + f"{values['BitBLAS_top20_latency']:.3f} ms", + ] + row_str = "".join( + [str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)]) + print(row_str) diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index bcdbf6c7551..c92ea43e826 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -90,7 +90,8 @@ def bench_run(results: list[benchmark.Measurement], model: str, score = torch.randn((m, num_experts), device="cuda", dtype=dtype) - topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False) + topk_weights, topk_ids, token_expert_indices = fused_topk( + a, score, topk, renormalize=False) def run_triton_moe(a: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index b4b91eda284..d382ede10b4 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -17,8 +17,14 @@ from utils import ArgPool, Bench, CudaGraphBenchParams from weight_shapes import WEIGHT_SHAPES -from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink -from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT +from vllm.triton_utils import HAS_TRITON + +if HAS_TRITON: + from vllm.lora.ops.triton_ops import (LoRAKernelMeta, lora_expand, + lora_shrink) + from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT, + _LORA_B_PTR_DICT) + from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index afe0b53077a..9407747f784 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -115,8 +115,8 @@ def run(): 
from vllm.model_executor.layers.fused_moe import override_config with override_config(config): if use_deep_gemm: - topk_weights, topk_ids = fused_topk(x, input_gating, topk, - False) + topk_weights, topk_ids, token_expert_indices = fused_topk( + x, input_gating, topk, False) return fused_experts( x, w1, @@ -442,8 +442,14 @@ def tune( hidden_size, search_space, is_fp16, topk) - with torch.cuda.device(self.device_id) if current_platform.is_rocm( - ) else nullcontext(): + need_device_guard = False + if current_platform.is_rocm(): + visible_device = os.environ.get("ROCR_VISIBLE_DEVICES", None) + if visible_device != f"{self.device_id}": + need_device_guard = True + + with torch.cuda.device( + self.device_id) if need_device_guard else nullcontext(): for config in tqdm(search_space): try: kernel_time = benchmark_config( @@ -527,7 +533,7 @@ def get_weight_block_size_safety(config, default_value=None): def main(args: argparse.Namespace): print(args) - block_quant_shape = None + config = AutoConfig.from_pretrained( args.model, trust_remote_code=args.trust_remote_code) if config.architectures[0] == "DbrxForCausalLM": @@ -546,16 +552,16 @@ def main(args: argparse.Namespace): topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size shard_intermediate_size = 2 * intermediate_size // args.tp_size - block_quant_shape = get_weight_block_size_safety(config) - elif config.architectures[0] == "Qwen2MoeForCausalLM": + elif config.architectures[0] in [ + "Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM" + ]: E = config.num_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size shard_intermediate_size = 2 * intermediate_size // args.tp_size else: - if not hasattr(config, "hidden_size"): - # Support for llama4 - config = config.text_config + # Support for llama4 + config = config.get_text_config() # Default: Mixtral. E = config.num_local_experts topk = config.num_experts_per_tok @@ -566,6 +572,7 @@ def main(args: argparse.Namespace): dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" + block_quant_shape = get_weight_block_size_safety(config) if args.batch_size is None: batch_sizes = [ @@ -577,6 +584,15 @@ def main(args: argparse.Namespace): use_deep_gemm = bool(args.use_deep_gemm) + if current_platform.is_rocm() and "HIP_VISIBLE_DEVICES" in os.environ: + # Ray will set ROCR_VISIBLE_DEVICES for device visibility + logger.warning( + "Ray uses ROCR_VISIBLE_DEVICES to control device accessibility." 
+ "Replacing HIP_VISIBLE_DEVICES with ROCR_VISIBLE_DEVICES.") + val = os.environ["HIP_VISIBLE_DEVICES"] + os.environ["ROCR_VISIBLE_DEVICES"] = val + del os.environ["HIP_VISIBLE_DEVICES"] + ray.init() num_gpus = int(ray.available_resources()["GPU"]) workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py new file mode 100644 index 00000000000..937df962465 --- /dev/null +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -0,0 +1,349 @@ +# SPDX-License-Identifier: Apache-2.0 + +import argparse +from typing import Any, TypedDict + +import ray +import torch +from transformers import AutoConfig + +from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( + _moe_permute, _moe_unpermute_and_reduce) +from vllm.model_executor.layers.fused_moe.fused_moe import * +from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import * +from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser + +FP8_DTYPE = current_platform.fp8_dtype() + + +class BenchmarkConfig(TypedDict): + BLOCK_SIZE_M: int + BLOCK_SIZE_N: int + BLOCK_SIZE_K: int + GROUP_SIZE_M: int + num_warps: int + num_stages: int + + +def benchmark_permute(num_tokens: int, + num_experts: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + num_iters: int = 100, + use_customized_permute: bool = False) -> float: + # init_dtype = torch.float16 if use_fp8_w8a8 else dtype + hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) + # output_hidden_states = torch.empty_like(hidden_states) + if use_fp8_w8a8: + align_block_size = 128 # deepgemm needs 128 m aligned block + qhidden_states, scale = _fp8_quantize(hidden_states, None, None) + else: + align_block_size = None + qhidden_states = hidden_states + + gating_output = torch.randn(num_iters, + num_tokens, + num_experts, + dtype=torch.float32) + + input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32) + topk_weights, topk_ids, token_expert_indices = fused_topk( + qhidden_states, input_gating, topk, False) + + def prepare(i: int): + input_gating.copy_(gating_output[i]) + + def run(): + if use_customized_permute: + (permuted_hidden_states, first_token_off, inv_perm_idx, + m_indices) = moe_permute( + qhidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + token_expert_indices=token_expert_indices, + topk=topk, + n_expert=num_experts, + n_local_expert=num_experts, + expert_map=None, + align_block_size=align_block_size, + ) + else: + (permuted_hidden_states, a1q_scale, sorted_token_ids, expert_ids, + inv_perm) = _moe_permute(qhidden_states, None, topk_ids, + num_experts, None, align_block_size) + + # JIT compilation & warmup + run() + torch.cuda.synchronize() + + # Capture 10 invocations with CUDA graph + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph): + for _ in range(10): + run() + torch.cuda.synchronize() + + # Warmup + for _ in range(5): + graph.replay() + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + latencies: list[float] = [] + for i in range(num_iters): + prepare(i) + torch.cuda.synchronize() + + start_event.record() + graph.replay() + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) + avg = 
sum(latencies) / (num_iters * 10) * 1000 # us + graph.reset() + return avg + + +def benchmark_unpermute(num_tokens: int, + num_experts: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + num_iters: int = 100, + use_customized_permute: bool = False) -> float: + # init_dtype = torch.float16 if use_fp8_w8a8 else dtype + hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) + output_hidden_states = torch.empty_like(hidden_states) + if use_fp8_w8a8: + align_block_size = 128 # deepgemm needs 128 m aligned block + qhidden_states, scale = _fp8_quantize(hidden_states, None, None) + else: + align_block_size = None + qhidden_states = hidden_states + + input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32) + + topk_weights, topk_ids, token_expert_indices = fused_topk( + qhidden_states, input_gating, topk, False) + + def prepare(): + if use_customized_permute: + (permuted_hidden_states, first_token_off, inv_perm_idx, + m_indices) = moe_permute( + qhidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + token_expert_indices=token_expert_indices, + topk=topk, + n_expert=num_experts, + n_local_expert=num_experts, + expert_map=None, + align_block_size=align_block_size, + ) + # convert to fp16/bf16 as gemm output + return (permuted_hidden_states.to(dtype), first_token_off, + inv_perm_idx, m_indices) + else: + (permuted_qhidden_states, a1q_scale, sorted_token_ids, expert_ids, + inv_perm) = _moe_permute(qhidden_states, None, topk_ids, + num_experts, None, align_block_size) + # convert to fp16/bf16 as gemm output + return (permuted_qhidden_states.to(dtype), a1q_scale, + sorted_token_ids, expert_ids, inv_perm) + + def run(input: tuple): + if use_customized_permute: + (permuted_hidden_states, first_token_off, inv_perm_idx, + m_indices) = input + moe_unpermute(permuted_hidden_states, topk_weights, topk_ids, + inv_perm_idx, first_token_off, topk, num_experts, + num_experts) + else: + (permuted_hidden_states, a1q_scale, sorted_token_ids, expert_ids, + inv_perm) = input + _moe_unpermute_and_reduce(output_hidden_states, + permuted_hidden_states, inv_perm, + topk_weights) + + # JIT compilation & warmup + input = prepare() + run(input) + torch.cuda.synchronize() + + # Capture 10 invocations with CUDA graph + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph): + for _ in range(10): + run(input) + torch.cuda.synchronize() + + # Warmup + for _ in range(5): + graph.replay() + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + latencies: list[float] = [] + for i in range(num_iters): + torch.cuda.synchronize() + start_event.record() + graph.replay() + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) + avg = sum(latencies) / (num_iters * 10) * 1000 # us + graph.reset() + return avg + + +@ray.remote(num_gpus=1) +class BenchmarkWorker: + + def __init__(self, seed: int) -> None: + torch.set_default_device("cuda") + current_platform.seed_everything(seed) + self.seed = seed + # Get the device ID to allocate tensors and kernels + # on the respective GPU. This is required for Ray to work + # correctly with multi-GPU tuning on the ROCm platform. 
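+        # Illustrative note: ray.get_gpu_ids() returns the GPU ids Ray
+        # assigned to this actor (e.g. [1] or ["1"] for an actor pinned to
+        # the second GPU, depending on the Ray version), so the int(...)
+        # conversion below yields a plain integer device id.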
+ self.device_id = int(ray.get_gpu_ids()[0]) + + def benchmark( + self, + num_tokens: int, + num_experts: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + use_customized_permute: bool = False, + ) -> tuple[dict[str, int], float]: + current_platform.seed_everything(self.seed) + + permute_time = benchmark_permute( + num_tokens, + num_experts, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=100, + use_customized_permute=use_customized_permute) + unpermute_time = benchmark_unpermute( + num_tokens, + num_experts, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=100, + use_customized_permute=use_customized_permute) + return permute_time, unpermute_time + + +def get_weight_block_size_safety(config, default_value=None): + + quantization_config = getattr(config, 'quantization_config', {}) + if isinstance(quantization_config, dict): + return quantization_config.get('weight_block_size', default_value) + return default_value + + +def main(args: argparse.Namespace): + print(args) + + config = AutoConfig.from_pretrained( + args.model, trust_remote_code=args.trust_remote_code) + if config.architectures[0] == "DbrxForCausalLM": + E = config.ffn_config.moe_num_experts + topk = config.ffn_config.moe_top_k + elif config.architectures[0] == "JambaForCausalLM": + E = config.num_experts + topk = config.num_experts_per_tok + elif (config.architectures[0] == "DeepseekV3ForCausalLM" + or config.architectures[0] == "DeepseekV2ForCausalLM"): + E = config.n_routed_experts + topk = config.num_experts_per_tok + elif config.architectures[0] in [ + "Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM" + ]: + E = config.num_experts + topk = config.num_experts_per_tok + + else: + # Support for llama4 + config = config.get_text_config() + # Default: Mixtral. 
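+        # Note (assumed transformers behavior): for multimodal configs such
+        # as Llama-4, get_text_config() above returns the nested text config,
+        # while for plain text models it should return the config itself, so
+        # the Mixtral-style fields below resolve either way.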
+ E = config.num_local_experts + topk = config.num_experts_per_tok + + hidden_size = config.hidden_size + dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype + use_fp8_w8a8 = args.dtype == "fp8_w8a8" + use_int8_w8a16 = args.dtype == "int8_w8a16" + use_customized_permute = args.use_customized_permute + + if args.batch_size is None: + batch_sizes = [ + 1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536, + 2048, 3072, 4096 + ] + else: + batch_sizes = [args.batch_size] + + ray.init() + num_gpus = int(ray.available_resources()["GPU"]) + workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] + + def _distribute(method: str, inputs: list[Any]) -> list[Any]: + outputs = [] + worker_idx = 0 + for input_args in inputs: + worker = workers[worker_idx] + worker_method = getattr(worker, method) + output = worker_method.remote(*input_args) + outputs.append(output) + worker_idx = (worker_idx + 1) % num_gpus + return ray.get(outputs) + + outputs = _distribute( + "benchmark", [(batch_size, E, hidden_size, topk, dtype, use_fp8_w8a8, + use_int8_w8a16, use_customized_permute) + for batch_size in batch_sizes]) + + for batch_size, (permute, unpermute) in zip(batch_sizes, outputs): + print(f"Batch size: {batch_size}") + print(f"Permute time: {permute:.2f} us") + print(f"Unpermute time: {unpermute:.2f} us") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser() + parser.add_argument("--model", + type=str, + default="mistralai/Mixtral-8x7B-Instruct-v0.1") + parser.add_argument("--dtype", + type=str, + choices=["auto", "fp8_w8a8", "int8_w8a16"], + default="auto") + parser.add_argument("--use-customized-permute", action="store_true") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--batch-size", type=int, required=False) + parser.add_argument("--trust-remote-code", action="store_true") + args = parser.parse_args() + + main(args) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index afd7c47e8ac..b04e4c2d06e 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG dc9d410b3e2d6534a4c70724c2515f4def670a22 + GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/csrc/attention/merge_attn_states.cu b/csrc/attention/merge_attn_states.cu index 7af0caceda2..14e5edd7e28 100644 --- a/csrc/attention/merge_attn_states.cu +++ b/csrc/attention/merge_attn_states.cu @@ -107,13 +107,14 @@ __global__ void merge_attn_states_kernel( #define LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS) \ { \ - vllm::merge_attn_states_kernel<<>>( \ - reinterpret_cast(output.data_ptr()), output_lse_ptr, \ - reinterpret_cast(prefix_output.data_ptr()), \ - reinterpret_cast(prefix_lse.data_ptr()), \ - reinterpret_cast(suffix_output.data_ptr()), \ - reinterpret_cast(suffix_lse.data_ptr()), num_tokens, \ - num_heads, head_size); \ + vllm::merge_attn_states_kernel \ + <<>>( \ + reinterpret_cast(output.data_ptr()), output_lse_ptr, \ + reinterpret_cast(prefix_output.data_ptr()), \ + reinterpret_cast(prefix_lse.data_ptr()), \ + reinterpret_cast(suffix_output.data_ptr()), \ + reinterpret_cast(suffix_lse.data_ptr()), num_tokens, \ + num_heads, head_size); \ } /*@brief Merges the attention states 
from prefix and suffix @@ -122,10 +123,10 @@ __global__ void merge_attn_states_kernel( * @param output [n,h,d] The output tensor to store the merged attention states. * @param output_lse [h,d] Optional tensor to store the log-sum-exp values. * @param prefix_output [n,h,d] The prefix attention states. - * @param prefix_lse [h,d] The log-sum-exp values for the prefix attention + * @param prefix_lse [h,n] The log-sum-exp values for the prefix attention * states. * @param suffix_output [n,h,d] The suffix attention states. - * @param suffix_lse [h,d] The log-sum-exp values for the suffix attention + * @param suffix_lse [h,n] The log-sum-exp values for the suffix attention * states. */ template @@ -146,13 +147,17 @@ void merge_attn_states_launcher(torch::Tensor& output, if (output_lse.has_value()) { output_lse_ptr = output_lse.value().data_ptr(); } - // process one pack elements per thread. float -> 4, half/bf16 -> 8 + // Process one pack elements per thread. for float, the + // pack_size is 4 for half/bf16, the pack_size is 8. const uint threads_per_head = head_size / pack_size; const uint total_threads = num_tokens * num_heads * threads_per_head; dim3 block(NUM_THREADS); dim3 grid((total_threads + NUM_THREADS - 1) / NUM_THREADS); + const c10::cuda::OptionalCUDAGuard device_guard(prefix_output.device()); + auto stream = at::cuda::getCurrentCUDAStream(); + LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS); } diff --git a/csrc/attention/mla/cutlass_mla_entry.cu b/csrc/attention/mla/cutlass_mla_entry.cu new file mode 100644 index 00000000000..0319d1daf30 --- /dev/null +++ b/csrc/attention/mla/cutlass_mla_entry.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA +void cutlass_mla_decode_sm100a(torch::Tensor const& out, + torch::Tensor const& q_nope, + torch::Tensor const& q_pe, + torch::Tensor const& kv_c_and_k_pe_cache, + torch::Tensor const& seq_lens, + torch::Tensor const& page_table, double scale); +#endif + +void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope, + torch::Tensor const& q_pe, + torch::Tensor const& kv_c_and_k_pe_cache, + torch::Tensor const& seq_lens, + torch::Tensor const& page_table, double scale) { +#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA + return cutlass_mla_decode_sm100a(out, q_nope, q_pe, kv_c_and_k_pe_cache, + seq_lens, page_table, scale); +#endif + TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled cutlass MLA"); +} diff --git a/csrc/attention/mla/cutlass_mla_kernels.cu b/csrc/attention/mla/cutlass_mla_kernels.cu new file mode 100644 index 00000000000..6743af0cf2d --- /dev/null +++ b/csrc/attention/mla/cutlass_mla_kernels.cu @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include "cute/tensor.hpp" + +#include "cutlass/cutlass.h" +#include "cutlass/kernel_hardware_info.h" + +#include "cutlass_extensions/common.hpp" + +#include "device/sm100_mla.hpp" +#include "kernel/sm100_mla_tile_scheduler.hpp" + +using namespace cute; +using namespace cutlass::fmha::kernel; + +template +struct MlaSm100 { + using Element = T; + using ElementAcc = float; + using ElementOut = T; + + using TileShape = Shape<_128, _128, Shape<_512, _64>>; + using TileShapeH = cute::tuple_element_t<0, TileShape>; + using TileShapeD = cute::tuple_element_t<2, TileShape>; + + // H K (D_latent D_rope) B + using ProblemShape = cute::tuple; + + using StrideQ = cute::tuple; // H D B + using StrideK = cute::tuple; // K D B + using StrideO = StrideK; // H D B + using StrideLSE = cute::tuple<_1, int>; // H B + + using TileScheduler = + std::conditional_t; + + using FmhaKernel = + cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized< + TileShape, Element, ElementAcc, ElementOut, ElementAcc, TileScheduler, + /*kIsCpAsync=*/true>; + using Fmha = cutlass::fmha::device::MLA; +}; + +template +typename T::Fmha::Arguments args_from_options( + at::Tensor const& out, at::Tensor const& q_nope, at::Tensor const& q_pe, + at::Tensor const& kv_c_and_k_pe_cache, at::Tensor const& seq_lens, + at::Tensor const& page_table, double scale) { + cutlass::KernelHardwareInfo hw_info; + hw_info.device_id = q_nope.device().index(); + hw_info.sm_count = + cutlass::KernelHardwareInfo::query_device_multiprocessor_count( + hw_info.device_id); + + int batches = q_nope.sizes()[0]; + int page_count_per_seq = page_table.sizes()[1]; + int page_count_total = kv_c_and_k_pe_cache.sizes()[0]; + int page_size = kv_c_and_k_pe_cache.sizes()[1]; + int max_seq_len = page_size * page_count_per_seq; + using TileShapeH = typename T::TileShapeH; + using TileShapeD = typename T::TileShapeD; + auto problem_shape = + cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches); + + auto [H, K, D, B] = problem_shape; + auto [D_latent, D_rope] = D; + + using StrideQ = typename T::StrideQ; + using StrideK = typename T::StrideK; + using StrideO = typename T::StrideO; + using StrideLSE = typename T::StrideLSE; + + StrideQ stride_Q_latent = cute::make_tuple( + static_cast(D_latent), _1{}, static_cast(H * D_latent)); + StrideQ stride_Q_rope = cute::make_tuple(static_cast(D_rope), _1{}, + static_cast(H * D_rope)); + StrideK stride_C = + cute::make_tuple(static_cast(D_latent + D_rope), _1{}, + static_cast(page_size * (D_latent + D_rope))); + StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq); + StrideLSE stride_LSE = cute::make_tuple(_1{}, static_cast(H)); + StrideO stride_O = cute::make_tuple(static_cast(D_latent), _1{}, + static_cast(H * D_latent)); + + using Element = typename T::Element; + using ElementOut = typename T::ElementOut; + using ElementAcc = typename T::ElementAcc; + auto Q_latent_ptr = static_cast(q_nope.data_ptr()); + auto Q_rope_ptr = static_cast(q_pe.data_ptr()); + auto C_ptr = static_cast(kv_c_and_k_pe_cache.data_ptr()); + auto scale_f = static_cast(scale); 
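+  // Note: the latent and rope halves of kv_c_and_k_pe_cache share one
+  // allocation, so the rope pointer passed below is simply C_ptr offset by
+  // D_latent elements and reuses stride_C.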
+ typename T::Fmha::Arguments arguments{ + problem_shape, + {scale_f, Q_latent_ptr, stride_Q_latent, Q_rope_ptr, stride_Q_rope, C_ptr, + stride_C, C_ptr + D_latent, stride_C, + static_cast(seq_lens.data_ptr()), + static_cast(page_table.data_ptr()), stride_PT, page_count_total, + page_size}, + {static_cast(out.data_ptr()), stride_O, + static_cast(nullptr), stride_LSE}, + hw_info, + -1, // split_kv + nullptr, // is_var_split_kv + }; + // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute + // split_kv automatically based on batch size and sequence length to balance + // workload across available SMs. Consider using var_split_kv for manual + // control if needed. + T::Fmha::set_split_kv(arguments); + return arguments; +} + +template +void runMla(at::Tensor const& out, at::Tensor const& q_nope, + at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache, + at::Tensor const& seq_lens, at::Tensor const& page_table, + float scale, cudaStream_t stream) { + using MlaSm100Type = MlaSm100; + typename MlaSm100Type::Fmha fmha; + auto arguments = args_from_options( + out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale); + size_t workspace_size = MlaSm100Type::Fmha::get_workspace_size(arguments); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(q_nope.device()); + auto workspace = torch::empty(workspace_size, workspace_options); + + CUTLASS_CHECK(fmha.can_implement(arguments)); + + CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream)); + + CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream)); +} + +void cutlass_mla_decode_sm100a(torch::Tensor const& out, + torch::Tensor const& q_nope, + torch::Tensor const& q_pe, + torch::Tensor const& kv_c_and_k_pe_cache, + torch::Tensor const& seq_lens, + torch::Tensor const& page_table, double scale) { + TORCH_CHECK(q_nope.device().is_cuda(), "q_nope must be on CUDA"); + TORCH_CHECK(q_nope.dim() == 3, "q_nope must be a 3D tensor"); + TORCH_CHECK(q_pe.dim() == 3, "q_pe must be a 3D tensor"); + TORCH_CHECK(kv_c_and_k_pe_cache.dim() == 3, + "kv_c_and_k_pe_cache must be a 3D tensor"); + TORCH_CHECK(seq_lens.dim() == 1, "seq_lens must be a 1D tensor"); + TORCH_CHECK(page_table.dim() == 2, "page_table must be a 2D tensor"); + TORCH_CHECK(out.dim() == 3, "out must be a 3D tensor"); + + auto B_q_nope = q_nope.size(0); + auto H_q_nope = q_nope.size(1); + auto D_q_nope = q_nope.size(2); + auto B_q_pe = q_pe.size(0); + auto H_q_pe = q_pe.size(1); + auto D_q_pe = q_pe.size(2); + auto B_pt = page_table.size(0); + auto PAGE_NUM = page_table.size(1); + auto PAGE_SIZE = kv_c_and_k_pe_cache.size(1); + auto D_ckv = kv_c_and_k_pe_cache.size(2); + auto B_o = out.size(0); + auto H_o = out.size(1); + auto D_o = out.size(2); + + TORCH_CHECK(D_q_nope == 512, "D_q_nope must be equal to 512"); + TORCH_CHECK(D_q_pe == 64, "D_q_pe must be equal to 64"); + TORCH_CHECK(D_ckv == 576, "D_ckv must be equal to 576"); + TORCH_CHECK(H_q_nope == H_q_pe && H_q_nope == H_o && H_o == 128, + "H_q_nope, H_q_pe, and H_o must be equal to 128"); + TORCH_CHECK(PAGE_SIZE > 0 && (PAGE_SIZE & (PAGE_SIZE - 1)) == 0, + "PAGE_SIZE must be a power of 2"); + TORCH_CHECK( + B_q_nope == B_q_pe && B_q_nope == B_pt && B_q_nope == B_o, + "Batch dims must be same for page_table, q_nope and q_pe, and out"); + TORCH_CHECK(PAGE_NUM % (128 / PAGE_SIZE) == 0, + "PAGE_NUM must be divisible by 128 / PAGE_SIZE"); + TORCH_CHECK(D_o == 512, "D_o must be equal to 512"); + + TORCH_CHECK(q_nope.dtype() == at::ScalarType::Half || 
+ q_nope.dtype() == at::ScalarType::BFloat16 || + q_nope.dtype() == at::ScalarType::Float8_e4m3fn, + "q_nope must be a half, bfloat16, or float8_e4m3fn tensor"); + TORCH_CHECK(kv_c_and_k_pe_cache.dtype() == q_nope.dtype() && + q_nope.dtype() == q_pe.dtype(), + "kv_c_and_k_pe_cache, q_nope, and q_pe must be the same type"); + TORCH_CHECK(seq_lens.dtype() == torch::kInt32, + "seq_lens must be a 32-bit integer tensor"); + TORCH_CHECK(page_table.dtype() == torch::kInt32, + "page_table must be a 32-bit integer tensor"); + + auto in_dtype = q_nope.dtype(); + at::cuda::CUDAGuard device_guard{(char)q_nope.get_device()}; + const cudaStream_t stream = + at::cuda::getCurrentCUDAStream(q_nope.get_device()); + if (in_dtype == at::ScalarType::Half) { + runMla(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, + page_table, scale, stream); + } else if (in_dtype == at::ScalarType::BFloat16) { + runMla(out, q_nope, q_pe, kv_c_and_k_pe_cache, + seq_lens, page_table, scale, stream); + } else if (in_dtype == at::ScalarType::Float8_e4m3fn) { + runMla(out, q_nope, q_pe, kv_c_and_k_pe_cache, + seq_lens, page_table, scale, stream); + } else { + TORCH_CHECK(false, "Unsupported input data type of MLA"); + } +} diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 0b3f6fc8c19..88559c8fe71 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -270,9 +270,10 @@ __global__ void reshape_and_cache_flash_kernel( cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads, // head_size] const int64_t* __restrict__ slot_mapping, // [num_tokens] - const int block_stride, const int key_stride, const int value_stride, - const int num_heads, const int head_size, const int block_size, - const float* k_scale, const float* v_scale) { + const int64_t block_stride, const int64_t page_stride, + const int64_t head_stride, const int64_t key_stride, + const int64_t value_stride, const int num_heads, const int head_size, + const int block_size, const float* k_scale, const float* v_scale) { const int64_t token_idx = blockIdx.x; const int64_t slot_idx = slot_mapping[token_idx]; // NOTE: slot_idx can be -1 if the token is padded @@ -288,8 +289,8 @@ __global__ void reshape_and_cache_flash_kernel( const int head_idx = i / head_size; const int head_offset = i % head_size; const int64_t tgt_key_value_idx = block_idx * block_stride + - block_offset * num_heads * head_size + - head_idx * head_size + head_offset; + block_offset * page_stride + + head_idx * head_stride + head_offset; scalar_t tgt_key = key[src_key_idx]; scalar_t tgt_value = value[src_value_idx]; if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { @@ -396,16 +397,16 @@ void reshape_and_cache( // KV_T is the data type of key and value tensors. // CACHE_T is the stored data type of kv-cache. // KV_DTYPE is the real data type of kv-cache. 
-#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \ - vllm::reshape_and_cache_flash_kernel \ - <<>>( \ - reinterpret_cast(key.data_ptr()), \ - reinterpret_cast(value.data_ptr()), \ - reinterpret_cast(key_cache.data_ptr()), \ - reinterpret_cast(value_cache.data_ptr()), \ - slot_mapping.data_ptr(), block_stride, key_stride, \ - value_stride, num_heads, head_size, block_size, \ - reinterpret_cast(k_scale.data_ptr()), \ +#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \ + vllm::reshape_and_cache_flash_kernel \ + <<>>( \ + reinterpret_cast(key.data_ptr()), \ + reinterpret_cast(value.data_ptr()), \ + reinterpret_cast(key_cache.data_ptr()), \ + reinterpret_cast(value_cache.data_ptr()), \ + slot_mapping.data_ptr(), block_stride, page_stride, \ + head_stride, key_stride, value_stride, num_heads, head_size, \ + block_size, reinterpret_cast(k_scale.data_ptr()), \ reinterpret_cast(v_scale.data_ptr())); void reshape_and_cache_flash( @@ -432,9 +433,11 @@ void reshape_and_cache_flash( int head_size = key.size(2); int block_size = key_cache.size(1); - int key_stride = key.stride(0); - int value_stride = value.stride(0); - int block_stride = key_cache.stride(0); + int64_t key_stride = key.stride(0); + int64_t value_stride = value.stride(0); + int64_t block_stride = key_cache.stride(0); + int64_t page_stride = key_cache.stride(1); + int64_t head_stride = key_cache.stride(2); TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0)); dim3 grid(num_tokens); diff --git a/csrc/core/math.hpp b/csrc/core/math.hpp index b8171133f6a..6764e1fd605 100644 --- a/csrc/core/math.hpp +++ b/csrc/core/math.hpp @@ -7,3 +7,22 @@ inline constexpr uint32_t next_pow_2(uint32_t const num) { if (num <= 1) return num; return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); } + +template +static inline constexpr auto div_ceil(A a, B b) { + return (a + b - 1) / b; +} + +// Round a down to the next multiple of b. The caller is responsible for making +// sure that b is non-zero +template +inline constexpr T round_to_previous_multiple_of(T a, T b) { + return a % b == 0 ? a : (a / b) * b; +} + +// Round a up to the next multiple of b. The caller is responsible for making +// sure that b is non-zero +template +inline constexpr T round_to_next_multiple_of(T a, T b) { + return a % b == 0 ? a : ((a / b) + 1) * b; +} diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel.h b/csrc/moe/marlin_kernels/marlin_moe_kernel.h index 47ecf109d0f..a217401b3d7 100644 --- a/csrc/moe/marlin_kernels/marlin_moe_kernel.h +++ b/csrc/moe/marlin_kernels/marlin_moe_kernel.h @@ -138,8 +138,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -182,8 +182,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. 
- int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); const int SUB = 0x64006400; const int MUL = 0x2c002c00; diff --git a/csrc/moe/marlin_moe_wna16/marlin_template.h b/csrc/moe/marlin_moe_wna16/marlin_template.h index 205b308fe51..3705216cada 100644 --- a/csrc/moe/marlin_moe_wna16/marlin_template.h +++ b/csrc/moe/marlin_moe_wna16/marlin_template.h @@ -209,8 +209,8 @@ __device__ inline typename ScalarType::FragB dequant( const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -233,9 +233,9 @@ dequant(int q, // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); q >>= 4; - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); static constexpr uint32_t MUL = 0x3F803F80; static constexpr uint32_t ADD = 0xC308C308; diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu new file mode 100644 index 00000000000..76d5f0eab02 --- /dev/null +++ b/csrc/moe/moe_permute_unpermute_op.cu @@ -0,0 +1,133 @@ +#include +#include +#include +#include "permute_unpermute_kernels/moe_permute_unpermute_kernel.h" +#include "permute_unpermute_kernels/dispatch.h" +#include "core/registration.h" + +void moe_permute( + const torch::Tensor& input, // [n_token, hidden] + const torch::Tensor& topk_weights, //[n_token, topk] + torch::Tensor& topk_ids, // [n_token, topk] + const torch::Tensor& token_expert_indicies, // [n_token, topk] + const std::optional& expert_map, // [n_expert] + int64_t n_expert, int64_t n_local_expert, int64_t topk, + const std::optional& align_block_size, + torch::Tensor& + permuted_input, // [topk * n_token/align_block_size_m, hidden] + torch::Tensor& expert_first_token_offset, // [n_local_expert + 1] + torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk] + torch::Tensor& m_indices) { // [align_expand_m] + TORCH_CHECK(topk_weights.scalar_type() == at::ScalarType::Float, + "topk_weights must be float32"); + TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long, + "expert_first_token_offset must be int64"); + TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int, + "topk_ids must be int32"); + TORCH_CHECK(token_expert_indicies.scalar_type() == at::ScalarType::Int, + "token_expert_indicies must be int32"); + TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int, + "src_row_id2dst_row_id_map must be int32"); + TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1, + "expert_first_token_offset shape != n_local_expert+1") + TORCH_CHECK( + src_row_id2dst_row_id_map.sizes() == token_expert_indicies.sizes(), + "token_expert_indicies shape must be same as src_row_id2dst_row_id_map"); + auto n_token = input.sizes()[0]; + auto n_hidden = input.sizes()[1]; + auto align_block_size_value = + align_block_size.has_value() ? 
align_block_size.value() : -1; + auto stream = at::cuda::getCurrentCUDAStream().stream(); + const long sorter_size = + CubKeyValueSorter::getWorkspaceSize(n_token * topk, n_expert); + auto sort_workspace = torch::empty( + {sorter_size}, + torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false)); + auto permuted_experts_id = torch::empty_like(topk_ids); + auto dst_row_id2src_row_id_map = torch::empty_like(src_row_id2dst_row_id_map); + auto align_expert_first_token_offset = + torch::zeros_like(expert_first_token_offset); + + CubKeyValueSorter sorter{}; + int64_t* valid_num_ptr = nullptr; + // pre-process kernel for expert-parallelism: + // no local expert id plus "n_expert" offset for priority to local expert + // map local expert id [n, .., n+n_local_expert-1] to [0, n_local_expert -1] + // For example, 4 expert with ep_size=2. ep_rank=1 owns global expert id + // [2,3] with expert_map[-1, -1, 0, 1], preprocess_topk_id process topk_ids + // and map global expert id [2, 3] to local_expert id [0, 1] and map global + // expert id [0, 1] ( not in ep rank=1) to [4, 5] by plus n_expert. This map + // operation is to make local expert high priority in following sort topk_ids + // and scan local expert_first_token_offset for each ep rank for next group + // gemm. + if (expert_map.has_value()) { + const int* expert_map_ptr = get_ptr(expert_map.value()); + valid_num_ptr = + get_ptr(expert_first_token_offset) + n_local_expert; + preprocessTopkIdLauncher(get_ptr(topk_ids), n_token * topk, + expert_map_ptr, n_expert, stream); + } + // expert sort topk expert id and scan expert id get expert_first_token_offset + sortAndScanExpert(get_ptr(topk_ids), get_ptr(token_expert_indicies), + get_ptr(permuted_experts_id), + get_ptr(dst_row_id2src_row_id_map), + get_ptr(expert_first_token_offset), n_token, + n_expert, n_local_expert, topk, sorter, + get_ptr(sort_workspace), stream); + + // dispatch expandInputRowsKernelLauncher + MOE_DISPATCH(input.scalar_type(), [&] { + expandInputRowsKernelLauncher( + get_ptr(input), get_ptr(permuted_input), + get_ptr(topk_weights), get_ptr(permuted_experts_id), + get_ptr(dst_row_id2src_row_id_map), + get_ptr(src_row_id2dst_row_id_map), + get_ptr(expert_first_token_offset), n_token, valid_num_ptr, + n_hidden, topk, n_local_expert, align_block_size_value, stream); + }); + + // get m_indices and update expert_first_token_offset with align block + getMIndices(get_ptr(expert_first_token_offset), + get_ptr(align_expert_first_token_offset), + get_ptr(m_indices), n_local_expert, align_block_size_value, + stream); + if (align_block_size.has_value()) { + // update align_expert_first_token_offset + expert_first_token_offset.copy_(align_expert_first_token_offset); + } +} + +void moe_unpermute( + const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden] + const torch::Tensor& topk_weights, //[n_token, topk] + const torch::Tensor& topk_ids, // [n_token, topk] + const torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk] + const torch::Tensor& expert_first_token_offset, // [n_local_expert+1] + int64_t n_expert, int64_t n_local_expert, int64_t topk, + torch::Tensor& hidden_states // [n_token, hidden] +) { + TORCH_CHECK(src_row_id2dst_row_id_map.sizes() == topk_ids.sizes(), + "topk_ids shape must be same as src_row_id2dst_row_id_map"); + TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int, + "topk_ids must be int32"); + TORCH_CHECK( + permuted_hidden_states.scalar_type() == hidden_states.scalar_type(), + "topk_ids dtype must be same as 
src_row_id2dst_row_id_map"); + auto n_token = hidden_states.size(0); + auto n_hidden = hidden_states.size(1); + auto stream = at::cuda::getCurrentCUDAStream().stream(); + const int64_t* valid_ptr = + get_ptr(expert_first_token_offset) + n_local_expert; + MOE_DISPATCH(hidden_states.scalar_type(), [&] { + finalizeMoeRoutingKernelLauncher( + get_ptr(permuted_hidden_states), + get_ptr(hidden_states), get_ptr(topk_weights), + get_ptr(src_row_id2dst_row_id_map), get_ptr(topk_ids), + n_token, n_hidden, topk, valid_ptr, stream); + }); +} + +TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { + m.impl("moe_permute", &moe_permute); + m.impl("moe_unpermute", &moe_unpermute); +} \ No newline at end of file diff --git a/csrc/moe/moe_wna16.cu b/csrc/moe/moe_wna16.cu index 51ae76c1ec8..7b6a111c00a 100644 --- a/csrc/moe/moe_wna16.cu +++ b/csrc/moe/moe_wna16.cu @@ -13,7 +13,6 @@ template __global__ void moe_wna16_gemm_kernel( const scalar_t* __restrict__ input, scalar_t* __restrict__ output, - const uint32_t* __restrict__ qweight, const scalar_t* __restrict__ scales, const uint32_t* __restrict__ qzeros, @@ -54,8 +53,6 @@ __global__ void moe_wna16_gemm_kernel( if (token_index / top_k >= size_m) break; num_valid_tokens = m + 1; - if (blockIdx.z == 0 && offset_n < size_n) - output[token_index * size_n + offset_n] = Dtype::int2num(0); if (expert_id != -1) { int k_per_thread = DIVIDE(BLOCK_SIZE_K, BLOCK_SIZE_N); @@ -284,8 +281,7 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N, int64_t BLOCK_SIZE_K, int64_t bit) { const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); - auto options = - torch::TensorOptions().dtype(input.dtype()).device(input.device()); + output.zero_(); const int num_experts = b_qweight.size(0); const int size_m = input.size(0); @@ -302,9 +298,9 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, const uint32_t* b_qzeros_ptr; if (b_qzeros.has_value()) b_qzeros_ptr = (const uint32_t*)b_qzeros.value().data_ptr(); - const float* topk_weights_ptr; + const float* topk_weights_ptr = nullptr; if (topk_weights.has_value()) - topk_weights_ptr = (const float*)topk_weights.value().data_ptr(); + topk_weights_ptr = (const float*)topk_weights.value().data_ptr(); int groups_per_block_row = BLOCK_SIZE_K / group_size; TORCH_CHECK(bit == 4 || bit == 8, "bit must be 4 or 8"); diff --git a/csrc/moe/moe_wna16_utils.h b/csrc/moe/moe_wna16_utils.h index 4396b80240e..8ef03f0e605 100644 --- a/csrc/moe/moe_wna16_utils.h +++ b/csrc/moe/moe_wna16_utils.h @@ -108,11 +108,11 @@ __device__ inline void dequant(int q, half2* res) { const int MUL = 0x2c002c00; const int ADD = 0xd400d400; - int lo0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); q >>= 8; - int lo1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); res[0] = __hsub2(*reinterpret_cast(&lo0), *reinterpret_cast(&SUB)); @@ -149,13 +149,13 @@ __device__ inline void dequant(int q, nv_bfloat162* res) { static constexpr uint32_t MASK = 0x000f000f; static constexpr uint32_t EX = 0x43004300; - int lo0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int lo0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); q >>= 4; - int hi0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + 
int hi0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); q >>= 4; - int lo1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int lo1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); q >>= 4; - int hi1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int hi1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); static constexpr uint32_t MUL = 0x3F803F80; static constexpr uint32_t ADD = 0xC300C300; diff --git a/csrc/moe/permute_unpermute_kernels/dispatch.h b/csrc/moe/permute_unpermute_kernels/dispatch.h new file mode 100644 index 00000000000..41932cdd85b --- /dev/null +++ b/csrc/moe/permute_unpermute_kernels/dispatch.h @@ -0,0 +1,53 @@ +#pragma once +#include +#define MOE_SWITCH(TYPE, ...) \ + at::ScalarType _st = ::detail::scalar_type(TYPE); \ + switch (_st) { \ + __VA_ARGS__ \ + default: \ + TORCH_CHECK(false, "[moe permute]data type dispatch fail!") \ + } + +#define MOE_DISPATCH_CASE(enum_type, ...) \ + case enum_type: { \ + using scalar_t = ScalarType2CudaType::type; \ + __VA_ARGS__(); \ + break; \ + } +#define MOE_DISPATCH_FLOAT_CASE(...) \ + MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) + +#define MOE_DISPATCH(TYPE, ...) \ + MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__)) + +template +struct ScalarType2CudaType; + +template <> +struct ScalarType2CudaType { + using type = float; +}; +template <> +struct ScalarType2CudaType { + using type = half; +}; +template <> +struct ScalarType2CudaType { + using type = __nv_bfloat16; +}; + +// #if __CUDA_ARCH__ >= 890 +// fp8 +template <> +struct ScalarType2CudaType { + using type = __nv_fp8_e5m2; +}; +template <> +struct ScalarType2CudaType { + using type = __nv_fp8_e4m3; +}; +// #endif \ No newline at end of file diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu new file mode 100644 index 00000000000..aa353d0f043 --- /dev/null +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu @@ -0,0 +1,229 @@ + +#include "moe_permute_unpermute_kernel.h" + +// CubKeyValueSorter definition begin +CubKeyValueSorter::CubKeyValueSorter() + : num_experts_(0), num_bits_(sizeof(int) * 8) {} + +int CubKeyValueSorter::expertsToBits(int num_experts) { + // Max value we represent is V = num_experts + (num_experts - 1) = 2 * + // num_experts - 1 The maximum number of bits is therefore floor(log2(V)) + 1 + return static_cast(log2(2 * num_experts - 1)) + 1; +} + +CubKeyValueSorter::CubKeyValueSorter(int const num_experts) + : num_experts_(num_experts), num_bits_(expertsToBits(num_experts)) {} + +void CubKeyValueSorter::updateNumExperts(int const num_experts) { + num_experts_ = num_experts; + num_bits_ = expertsToBits(num_experts); +} + +size_t CubKeyValueSorter::getWorkspaceSize(size_t const num_key_value_pairs, + int const num_experts) { + int num_bits = expertsToBits(num_experts); + size_t required_storage = 0; + int* null_int = nullptr; + cub::DeviceRadixSort::SortPairs(nullptr, required_storage, null_int, null_int, + null_int, null_int, num_key_value_pairs, 0, + num_bits); + + // when num_key_value_pairs, num_experts, num_bits, required_storage = 64, + // 4, 3, 0 The required_storage seems to vary between 0 and 1 for the same + // inputs + if (required_storage == 0) { + required_storage = 1; + } + return 
required_storage; +} + +void CubKeyValueSorter::run(void* workspace, size_t const workspace_size, + int const* keys_in, int* keys_out, + int const* values_in, int* values_out, + size_t const num_key_value_pairs, + cudaStream_t stream) { + size_t expected_ws_size = getWorkspaceSize(num_key_value_pairs, num_experts_); + size_t actual_ws_size = workspace_size; + + TORCH_CHECK(expected_ws_size <= workspace_size, + "[CubKeyValueSorter::run] The allocated workspace is too small " + "to run this problem."); + cub::DeviceRadixSort::SortPairs(workspace, actual_ws_size, keys_in, keys_out, + values_in, values_out, num_key_value_pairs, 0, + num_bits_, stream); +} +// CubKeyValueSorter definition end + +static inline size_t pad_to_multiple_of_16(size_t const& input) { + static constexpr int ALIGNMENT = 16; + return ALIGNMENT * ((input + ALIGNMENT - 1) / ALIGNMENT); +} +template +__device__ inline int64_t findTotalEltsLessThanTarget(T const* sorted_indices, + int64_t const arr_length, + T const target) { + int64_t low = 0, high = arr_length - 1, target_location = -1; + while (low <= high) { + int64_t mid = (low + high) / 2; + + if (sorted_indices[mid] >= target) { + high = mid - 1; + } else { + low = mid + 1; + target_location = mid; + } + } + return target_location + 1; +} + +// Calculates the start offset of the tokens for a given expert. The last +// element is the total number of valid tokens +__global__ void computeExpertFirstTokenOffsetKernel( + int const* sorted_experts, int64_t const sorted_experts_len, + int const num_experts, int64_t* expert_first_token_offset) { + // First, compute the global tid. We only need 1 thread per expert. + int const expert = blockIdx.x * blockDim.x + threadIdx.x; + + // Note that expert goes [0, num_experts] (inclusive) because we want a count + // for the total number of active tokens at the end of the scan. 
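+  // Worked example (illustrative values): with sorted_experts = [0, 0, 1, 3, 3]
+  // and num_experts = 4, the binary search below yields
+  // expert_first_token_offset = [0, 2, 3, 3, 5]; the final entry (index
+  // num_experts) is the total number of valid tokens.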
+ if (expert >= num_experts + 1) { + return; + } + expert_first_token_offset[expert] = + findTotalEltsLessThanTarget(sorted_experts, sorted_experts_len, expert); +} + +void computeExpertFirstTokenOffset(int const* sorted_indices, + int const total_indices, + int const num_experts, + int64_t* expert_first_token_offset, + cudaStream_t stream) { + int const num_entries = num_experts + 1; + int const threads = std::min(1024, num_entries); + int const blocks = (num_entries + threads - 1) / threads; + + computeExpertFirstTokenOffsetKernel<<>>( + sorted_indices, total_indices, num_experts, expert_first_token_offset); +} + +void sortAndScanExpert(int* expert_for_source_row, const int* source_rows, + int* permuted_experts, int* permuted_rows, + int64_t* expert_first_token_offset, int num_rows, + int num_experts, int num_experts_per_node, int k, + CubKeyValueSorter& sorter, void* sorter_ws, + cudaStream_t stream) { + int64_t const expanded_num_rows = static_cast(k) * num_rows; + // We need to use the full num_experts because that is the sentinel value used + // by topk for disabled experts + sorter.updateNumExperts(num_experts); + size_t const sorter_ws_size_bytes = pad_to_multiple_of_16( + sorter.getWorkspaceSize(expanded_num_rows, num_experts)); + sorter.run((void*)sorter_ws, sorter_ws_size_bytes, expert_for_source_row, + permuted_experts, source_rows, permuted_rows, expanded_num_rows, + stream); + computeExpertFirstTokenOffset(permuted_experts, expanded_num_rows, + num_experts_per_node, expert_first_token_offset, + stream); +} + +__global__ void preprocessTopkIdKernel(int* topk_id_ptr, int size, + const int* expert_map_ptr, + int num_experts) { + auto tidx = threadIdx.x; + auto bidx = blockIdx.x; + auto lidx = tidx & 31; + auto widx = tidx >> 5; + auto warp_count = (blockDim.x + 31) >> 5; + auto offset = bidx * blockDim.x; + auto bound = min(offset + blockDim.x, size); + extern __shared__ int smem_expert_map[]; + // store expert_map in smem + for (int i = tidx; i < num_experts; i += blockDim.x) { + smem_expert_map[i] = expert_map_ptr[i]; + } + __syncthreads(); + + // query global expert id in expert map. 
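+  // Worked example (values taken from the moe_permute comment above): with
+  // expert_map = [-1, -1, 0, 1] and num_experts = 4 on ep_rank 1, a topk id
+  // of 2 maps to local expert 0, while a topk id of 0 (not owned by this
+  // rank) becomes 0 + 4 = 4, as described below.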
+ // if global expert id = -1 in exert map, plus n_expert + // else set global expert id = exert map[global expert id] + if (offset + tidx < bound) { + auto topk_id = topk_id_ptr[offset + tidx]; + auto local_expert_idx = smem_expert_map[topk_id]; + if (local_expert_idx == -1) { + topk_id += num_experts; + } else { + topk_id = local_expert_idx; + } + __syncwarp(); + topk_id_ptr[offset + tidx] = topk_id; + } +} +void preprocessTopkIdLauncher(int* topk_id_ptr, int size, + const int* expert_map_ptr, int num_experts, + cudaStream_t stream) { + int block = std::min(size, 1024); + int grid = (size + block - 1) / block; + int smem_size = (num_experts) * sizeof(int); + preprocessTopkIdKernel<<>>( + topk_id_ptr, size, expert_map_ptr, num_experts); +} + +template +__global__ void getMIndicesKernel(int64_t* expert_first_token_offset, + int64_t* align_expert_first_token_offset, + int* m_indices, const int num_local_expert, + const int align_block_size) { + int eidx = blockIdx.x; + int tidx = threadIdx.x; + extern __shared__ int64_t smem_expert_first_token_offset[]; + for (int i = tidx; i <= num_local_expert; i += blockDim.x) { + smem_expert_first_token_offset[tidx] = __ldg(expert_first_token_offset + i); + } + __syncthreads(); + auto last_token_offset = smem_expert_first_token_offset[eidx + 1]; + auto first_token_offset = smem_expert_first_token_offset[eidx]; + int n_token_in_expert = last_token_offset - first_token_offset; + + if constexpr (ALIGN_BLOCK_SIZE) { + n_token_in_expert = (n_token_in_expert + align_block_size - 1) / + align_block_size * align_block_size; + // round up to ALIGN_BLOCK_SIZE + int64_t accumulate_align_offset = 0; + for (int i = 1; i <= eidx + 1; i++) { + int n_token = smem_expert_first_token_offset[i] - + smem_expert_first_token_offset[i - 1]; + accumulate_align_offset = + accumulate_align_offset + (n_token + align_block_size - 1) / + align_block_size * align_block_size; + if (i == eidx) { + first_token_offset = accumulate_align_offset; + } + // last block store align_expert_first_token_offset + if (eidx == num_local_expert - 1 && threadIdx.x == 0) { + align_expert_first_token_offset[i] = accumulate_align_offset; + } + } + } + for (int idx = tidx; idx < n_token_in_expert; idx += blockDim.x) { + // update m_indice with expert id + m_indices[first_token_offset + idx] = eidx; + } +} + +void getMIndices(int64_t* expert_first_token_offset, + int64_t* align_expert_first_token_offset, int* m_indices, + int num_local_expert, const int align_block_size, + cudaStream_t stream) { + int block = 256; + int grid = num_local_expert; + int smem_size = sizeof(int64_t) * (num_local_expert + 1); + if (align_block_size == -1) { + getMIndicesKernel<<>>( + expert_first_token_offset, align_expert_first_token_offset, m_indices, + num_local_expert, align_block_size); + } else { + getMIndicesKernel<<>>( + expert_first_token_offset, align_expert_first_token_offset, m_indices, + num_local_expert, align_block_size); + } +} \ No newline at end of file diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h new file mode 100644 index 00000000000..43c29721cd1 --- /dev/null +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h @@ -0,0 +1,95 @@ +#pragma once +// reference from tensorrt_llm moe kernel implementation archive in +// https://github.com/BBuf/tensorrt-llm-moe/tree/master + +#include +#include +#include "dispatch.h" +#include +#include +#include +#include "cutlass/numeric_size.h" +#include 
"cutlass/array.h" + +template +inline T* get_ptr(torch::Tensor& t) { + return reinterpret_cast(t.data_ptr()); +} + +template +inline const T* get_ptr(const torch::Tensor& t) { + return reinterpret_cast(t.data_ptr()); +} + +class CubKeyValueSorter { + public: + CubKeyValueSorter(); + + CubKeyValueSorter(int const num_experts); + + void updateNumExperts(int const num_experts); + + static size_t getWorkspaceSize(size_t const num_key_value_pairs, + int const num_experts); + + void run(void* workspace, size_t const workspace_size, int const* keys_in, + int* keys_out, int const* values_in, int* values_out, + size_t const num_key_value_pairs, cudaStream_t stream); + + private: + static int expertsToBits(int experts); + int num_experts_; + int num_bits_; +}; + +void computeExpertFirstTokenOffset(int const* sorted_indices, + int const total_indices, + int const num_experts, + int64_t* expert_first_token_offset, + cudaStream_t stream); + +void sortAndScanExpert(int* expert_for_source_row, const int* source_rows, + int* permuted_experts, int* permuted_rows, + int64_t* expert_first_token_offset, int num_rows, + int num_experts, int num_experts_per_node, int k, + CubKeyValueSorter& sorter, void* sorter_ws, + cudaStream_t stream); + +template +void expandInputRowsKernelLauncher( + T const* unpermuted_input, T* permuted_output, + const float* unpermuted_scales, int* sorted_experts, + int const* expanded_dest_row_to_expanded_source_row, + int* expanded_source_row_to_expanded_dest_row, + int64_t* expert_first_token_offset, int64_t const num_rows, + int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k, + int num_local_experts, const int& align_block_size, cudaStream_t stream); + +// Final kernel to unpermute and scale +// This kernel unpermutes the original data, does the k-way reduction and +// performs the final skip connection. 
+template +__global__ void finalizeMoeRoutingKernel( + T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, + float const* scales, int const* expanded_source_row_to_expanded_dest_row, + int const* expert_for_source_row, int64_t const orig_cols, int64_t const k, + int64_t const* num_valid_ptr); + +template +void finalizeMoeRoutingKernelLauncher( + T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, + float const* scales, int const* expanded_source_row_to_expanded_dest_row, + int const* expert_for_source_row, int64_t const num_rows, + int64_t const cols, int64_t const k, int64_t const* num_valid_ptr, + cudaStream_t stream); + +void preprocessTopkIdLauncher(int* topk_id_ptr, int size, + const int* expert_map_ptr, int num_experts, + cudaStream_t stream); + +void getMIndices(int64_t* expert_first_token_offset, + int64_t* align_expert_first_token_offset, int* m_indices, + int num_local_expert, const int align_block_size, + cudaStream_t stream); + +#include "moe_permute_unpermute_kernel.inl" diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl new file mode 100644 index 00000000000..42441800fb1 --- /dev/null +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl @@ -0,0 +1,211 @@ +#pragma once + +template +__global__ void expandInputRowsKernel( + T const* unpermuted_input, T* permuted_output, + const float* unpermuted_scales, int* sorted_experts, + int const* expanded_dest_row_to_expanded_source_row, + int* expanded_source_row_to_expanded_dest_row, + int64_t* expert_first_token_offset, int64_t const num_rows, + int64_t const* num_dest_rows, int64_t const cols, int64_t k, + int num_local_experts, int align_block_size) { + // Reverse permutation map. + // I do this so that later, we can use the source -> dest map to do the k-way + // reduction and unpermuting. I need the reverse map for that reduction to + // allow each threadblock to do 1 k-way reduce without atomics later in MoE. 1 + // thread block will be responsible for all k summations. + int64_t expanded_dest_row = blockIdx.x; + int64_t const expanded_source_row = + expanded_dest_row_to_expanded_source_row[expanded_dest_row]; + int expert_id = sorted_experts[expanded_dest_row]; + + extern __shared__ int64_t smem_expert_first_token_offset[]; + int64_t align_expanded_row_accumulate = 0; + if constexpr (ALIGN_BLOCK_SIZE) { + // load g2s + for (int idx = threadIdx.x; idx < num_local_experts + 1; + idx += blockDim.x) { + smem_expert_first_token_offset[idx] = + __ldg(expert_first_token_offset + idx); + } + __syncthreads(); + int lane_idx = threadIdx.x & 31; + + if (lane_idx == 0) { + // set token_offset_in_expert = 0 if this expert is not local expert + int token_offset_in_expert = + expert_id >= num_local_experts + ? 
0 + : expanded_dest_row - smem_expert_first_token_offset[expert_id]; + int64_t accumulate_align_offset = 0; +#pragma unroll 1 + for (int eidx = 1; eidx <= min(expert_id, num_local_experts); eidx++) { + auto n_token_in_expert = smem_expert_first_token_offset[eidx] - + smem_expert_first_token_offset[eidx - 1]; + accumulate_align_offset += (n_token_in_expert + align_block_size - 1) / + align_block_size * align_block_size; + } + expanded_dest_row = accumulate_align_offset + token_offset_in_expert; + } + // lane0 shuffle broadcast align_expanded_dest_row + expanded_dest_row = __shfl_sync(0xffffffff, expanded_dest_row, 0); + } + + if (threadIdx.x == 0) { + assert(expanded_dest_row <= INT32_MAX); + expanded_source_row_to_expanded_dest_row[expanded_source_row] = + static_cast(expanded_dest_row); + } + + if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) { + // Load 128-bits per thread + constexpr int64_t ELEM_PER_THREAD = 128 / cutlass::sizeof_bits::value; + using DataElem = cutlass::Array; + + // Duplicate and permute rows + int64_t const source_k_rank = expanded_source_row / num_rows; + int64_t const source_row = expanded_source_row % num_rows; + + auto const* source_row_ptr = + reinterpret_cast(unpermuted_input + source_row * cols); + auto* dest_row_ptr = + reinterpret_cast(permuted_output + expanded_dest_row * cols); + + int64_t const start_offset = threadIdx.x; + int64_t const stride = blockDim.x; + int64_t const num_elems_in_col = cols / ELEM_PER_THREAD; + + for (int elem_index = start_offset; elem_index < num_elems_in_col; + elem_index += stride) { + dest_row_ptr[elem_index] = source_row_ptr[elem_index]; + } + } +} + +template +void expandInputRowsKernelLauncher( + T const* unpermuted_input, T* permuted_output, + const float* unpermuted_scales, int* sorted_experts, + int const* expanded_dest_row_to_expanded_source_row, + int* expanded_source_row_to_expanded_dest_row, + int64_t* expert_first_token_offset, int64_t const num_rows, + int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k, + int num_local_experts, const int& align_block_size, cudaStream_t stream) { + int64_t const blocks = num_rows * k; + int64_t const threads = 256; + using FuncPtr = decltype(&expandInputRowsKernel); + FuncPtr func_map[2][2] = { + {&expandInputRowsKernel, + &expandInputRowsKernel}, + {&expandInputRowsKernel, + &expandInputRowsKernel}, + }; + bool is_check_skip = num_valid_tokens_ptr != nullptr; + bool is_align_block_size = align_block_size != -1; + auto func = func_map[is_check_skip][is_align_block_size]; + + int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1); + + func<<>>( + unpermuted_input, permuted_output, unpermuted_scales, sorted_experts, + expanded_dest_row_to_expanded_source_row, + expanded_source_row_to_expanded_dest_row, expert_first_token_offset, + num_rows, num_valid_tokens_ptr, cols, k, num_local_experts, + align_block_size); +} + +template +__host__ __device__ constexpr static U arrayConvert(T const& input) { + using Type = typename U::Element; + static_assert(T::kElements == U::kElements); + U u; +#pragma unroll + for (int i = 0; i < U::kElements; i++) { + u[i] = static_cast(input[i]); + } + return u; +} + +template +__global__ void finalizeMoeRoutingKernel( + T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, + float const* scales, int const* expanded_source_row_to_expanded_dest_row, + int const* expert_for_source_row, int64_t const orig_cols, int64_t const k, + int64_t const* num_valid_ptr) { + assert(orig_cols % 4 == 0); + int64_t const 
original_row = blockIdx.x; + int64_t const num_rows = gridDim.x; + auto const offset = original_row * orig_cols; + OutputType* reduced_row_ptr = reduced_unpermuted_output + offset; + int64_t const num_valid = *num_valid_ptr; + + // Load 128-bits per thread, according to the smallest data type we read/write + constexpr int64_t FINALIZE_ELEM_PER_THREAD = + 128 / std::min(cutlass::sizeof_bits::value, + cutlass::sizeof_bits::value); + + int64_t const start_offset = threadIdx.x; + int64_t const stride = blockDim.x; + int64_t const num_elems_in_col = orig_cols / FINALIZE_ELEM_PER_THREAD; + + using InputElem = cutlass::Array; + using OutputElem = cutlass::Array; + using ComputeElem = cutlass::Array; + auto const* expanded_permuted_rows_v = + reinterpret_cast(expanded_permuted_rows); + auto* reduced_row_ptr_v = reinterpret_cast(reduced_row_ptr); + +#pragma unroll + for (int elem_index = start_offset; elem_index < num_elems_in_col; + elem_index += stride) { + ComputeElem thread_output; + thread_output.fill(0); + float row_rescale{0.f}; + for (int k_idx = 0; k_idx < k; ++k_idx) { + int64_t const expanded_original_row = original_row + k_idx * num_rows; + int64_t const expanded_permuted_row = + expanded_source_row_to_expanded_dest_row[expanded_original_row]; + + int64_t const k_offset = original_row * k + k_idx; + float const row_scale = scales[k_offset]; + + // Check after row_rescale has accumulated + if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) { + continue; + } + + auto const* expanded_permuted_rows_row_ptr = + expanded_permuted_rows_v + expanded_permuted_row * num_elems_in_col; + + int64_t const expert_idx = expert_for_source_row[k_offset]; + + ComputeElem expert_result = arrayConvert( + expanded_permuted_rows_row_ptr[elem_index]); + thread_output = thread_output + row_scale * (expert_result); + } + + OutputElem output_elem = + arrayConvert(thread_output); + reduced_row_ptr_v[elem_index] = output_elem; + } +} + +template +void finalizeMoeRoutingKernelLauncher( + T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, + float const* scales, int const* expanded_source_row_to_expanded_dest_row, + int const* expert_for_source_row, int64_t const num_rows, + int64_t const cols, int64_t const k, int64_t const* num_valid_ptr, + cudaStream_t stream) { + int64_t const blocks = num_rows; + int64_t const threads = 256; + bool const check_finished = num_valid_ptr != nullptr; + using FuncPtr = decltype(&finalizeMoeRoutingKernel); + FuncPtr func_map[2] = {&finalizeMoeRoutingKernel, + &finalizeMoeRoutingKernel}; + auto* const kernel = func_map[check_finished]; + kernel<<>>( + expanded_permuted_rows, reduced_unpermuted_output, scales, + expanded_source_row_to_expanded_dest_row, expert_for_source_row, cols, k, + num_valid_ptr); +} diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index d0de42251f9..2a8b9bb39ca 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -53,7 +53,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { "int size_m, int size_n, int size_k," "bool is_full_k, bool use_atomic_add," "bool use_fp32_reduce, bool is_zp_float) -> Tensor"); + m.def( + "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " + "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! " + "b_zeros, Tensor! g_idx, Tensor! perm, Tensor! 
workspace, " + "int b_q_type, SymInt size_m, " + "SymInt size_n, SymInt size_k, bool is_k_full, int num_experts, int " + "topk, " + "int moe_block_size, bool replicate_input, bool apply_weights)" + " -> Tensor"); + + m.def( + "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids," + "Tensor token_expert_indicies, Tensor? expert_map, int n_expert," + "int n_local_expert," + "int topk, int? align_block_size,Tensor! permuted_input, Tensor! " + "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! " + "m_indices)->()"); + m.def( + "moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights," + "Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor " + "expert_first_token_offset, int n_expert, int n_local_expert,int " + "topk, Tensor! hidden_states)->()"); // conditionally compiled so impl registration is in source file #endif diff --git a/csrc/ops.h b/csrc/ops.h index 86039a26041..59ae0937604 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -97,6 +97,9 @@ void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query, void silu_and_mul(torch::Tensor& out, torch::Tensor& input); +void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input, + torch::Tensor& scale); + void mul_and_silu(torch::Tensor& out, torch::Tensor& input); void gelu_and_mul(torch::Tensor& out, torch::Tensor& input); @@ -128,6 +131,12 @@ void advance_step_flashinfer( torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr, torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds); +void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope, + torch::Tensor const& q_pe, + torch::Tensor const& kv_c_and_k_pe_cache, + torch::Tensor const& seq_lens, + torch::Tensor const& page_table, double scale); + torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor); #ifndef USE_ROCM diff --git a/csrc/quantization/activation_kernels.cu b/csrc/quantization/activation_kernels.cu new file mode 100644 index 00000000000..acc3d672202 --- /dev/null +++ b/csrc/quantization/activation_kernels.cu @@ -0,0 +1,120 @@ +#include +#include +#include + +#include +#include "core/math.hpp" +#include "cuda_compat.h" +#include "dispatch_utils.h" + +#include "quantization/fp8/common.cuh" + +namespace vllm { + +template +__device__ __forceinline__ T silu_kernel(const T& x) { + // x * sigmoid(x) + return (T)(((float)x) / (1.0f + expf((float)-x))); +} + +// Activation and gating kernel template. +template +__global__ void act_and_mul_quant_kernel( + fp8_type* __restrict__ out, // [..., d] + const scalar_t* __restrict__ input, // [..., 2, d] + const float* scale, const int d) { + const int32_t blocks_per_token = gridDim.y; + + const int32_t elems_per_128bit_load = (128 / 8) / sizeof(scalar_t); + + // We don't expect the hidden dimension to exceed 32 bits so int32 should + // be safe here. + const int32_t tgt_elems_per_block = div_ceil(d, blocks_per_token); + const int32_t elems_per_block = + round_to_next_multiple_of(tgt_elems_per_block, elems_per_128bit_load); + const int32_t block_start = blockIdx.y * elems_per_block; + int32_t block_end = block_start + elems_per_block; + block_end = block_end > d ? 
d : block_end; + + // token_idx is 64 bit to prevent 32 bit overflow when the number of tokens + // is very large + const int64_t token_idx = blockIdx.x; + const scalar_t* __restrict__ x_ptr = input + token_idx * 2 * d; + const scalar_t* __restrict__ y_ptr = input + token_idx * 2 * d + d; + fp8_type* __restrict__ out_ptr = out + token_idx * d; + + // 128-bit vectorized code + const int32_t vec_loop_end = + round_to_previous_multiple_of(elems_per_128bit_load, block_end); + const int32_t vec_end_idx = vec_loop_end / elems_per_128bit_load; + const int32_t vec_start_idx = block_start / elems_per_128bit_load; + + const int4* __restrict__ x_128bit_ptr = reinterpret_cast(x_ptr); + const int4* __restrict__ y_128bit_ptr = reinterpret_cast(y_ptr); + int2* __restrict__ out_128bit_ptr = reinterpret_cast(out_ptr); + + float inverted_scale = 1 / *scale; +#pragma unroll + for (int32_t vec_idx = vec_start_idx + threadIdx.x; vec_idx < vec_end_idx; + vec_idx += blockDim.x) { + const int4 x_128bit = VLLM_LDG(&x_128bit_ptr[vec_idx]); + const int4 y_128bit = VLLM_LDG(&y_128bit_ptr[vec_idx]); + using scalar_128bit_vec_t = std::array; + using scalar_64bit_vec_t = std::array; + + scalar_64bit_vec_t out_vec; + const auto x_vec = reinterpret_cast(x_128bit); + const auto y_vec = reinterpret_cast(y_128bit); + +#pragma unroll + for (int i = 0; i < elems_per_128bit_load; i++) { + out_vec[i] = scaled_fp8_conversion( + ACT_FN(x_vec[i]) * y_vec[i], inverted_scale); + } + + out_128bit_ptr[vec_idx] = reinterpret_cast(out_vec); + } + + // Scalar cleanup code + if (block_end > vec_loop_end) { + for (int64_t idx = vec_loop_end + threadIdx.x; idx < block_end; + idx += blockDim.x) { + const scalar_t x = VLLM_LDG(&x_ptr[idx]); + const scalar_t y = VLLM_LDG(&y_ptr[idx]); + out_ptr[idx] = + scaled_fp8_conversion(ACT_FN(x) * y, inverted_scale); + } + } +} +} // namespace vllm + +// Launch activation, gating, and quantize kernel. +#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + dim3 grid(num_tokens, num_tokens > 16 ? num_tokens > 32 ? 
1 : 2 : 4); \ + dim3 block(std::min(d, 512)); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "act_and_mul_kernel", [&] { \ + VLLM_DISPATCH_FP8_TYPES( \ + out.scalar_type(), "fused_add_rms_norm_kernel_fp8_type", [&] { \ + vllm::act_and_mul_quant_kernel, \ + fp8_t> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), \ + scale.data_ptr(), d); \ + }); \ + }); + +void silu_and_mul_quant(torch::Tensor& out, // [..., d] + torch::Tensor& input, // [..., 2 * d] + torch::Tensor& scale) { + TORCH_CHECK(out.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(input.dtype() == torch::kFloat16 || + input.dtype() == torch::kBFloat16); + TORCH_CHECK(input.size(-1) % 2 == 0); + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel); +} diff --git a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu index 2fb0417ce6c..894727383a6 100644 --- a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu +++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu @@ -46,14 +46,26 @@ __global__ void compute_expert_offsets( } __global__ void compute_arg_sorts(const int* __restrict__ topk_ids, + const int32_t* __restrict__ expert_offsets, int32_t* input_permutation, int32_t* output_permutation, int32_t* atomic_buffer, const int topk_length, const int topk) { - int expert_id = blockIdx.x; + int const blk_expert_id = blockIdx.x; + int const num_experts = gridDim.x; + int32_t const num_tokens = expert_offsets[num_experts]; for (int i = threadIdx.x; i < topk_length; i += THREADS_PER_EXPERT) { - if (topk_ids[i] == expert_id) { + int const expert_id = topk_ids[i]; + if (expert_id == -1 && blockIdx.x == 0) { + // output_permutation is used to re-order the moe outputs. It is + // used as c2 = c2[c_map], where c2 is a torch.tensor that is the + // output of the cutlass kernels and c_map is the output_permutation. + // c2 is initialized to zeros, therefore by setting the output_permutation + // to num_tokens, we are guaranteed to fill the moe outputs to zero + // for "invalid" topk_ids. 
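+      // Note: every expert block scans the full topk_ids array, so gating this
+      // write on blockIdx.x == 0 merely avoids redundant stores of the same
+      // sentinel index (num_tokens) from the remaining blocks.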
+ output_permutation[i] = num_tokens; + } else if (expert_id == blk_expert_id) { int start = atomicAdd(&atomic_buffer[expert_id], 1); input_permutation[start] = i / topk; output_permutation[i] = start; @@ -83,6 +95,7 @@ void get_cutlass_moe_mm_data_caller( static_cast(atomic_buffer.data_ptr()), num_experts); compute_arg_sorts<<>>( static_cast(topk_ids.data_ptr()), + static_cast(expert_offsets.data_ptr()), static_cast(input_permutation.data_ptr()), static_cast(output_permutation.data_ptr()), static_cast(atomic_buffer.data_ptr()), topk_ids.numel(), diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh index 4e82c99c3af..6082937e7e1 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh @@ -336,7 +336,7 @@ inline void cutlass_gemm_sm89_fp8_dispatch(torch::Tensor& out, uint32_t const m = a.size(0); uint32_t const mp2 = - std::max(static_cast(32), next_pow_2(m)); // next power of 2 + std::max(static_cast(16), next_pow_2(m)); // next power of 2 if (mp2 <= 16) { // M in [1, 16] diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh index 95723b31ca3..87be125b2eb 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh @@ -321,7 +321,7 @@ inline void cutlass_gemm_sm89_int8_dispatch(torch::Tensor& out, uint32_t const m = a.size(0); uint32_t const mp2 = - std::max(static_cast(32), next_pow_2(m)); // next power of 2 + std::max(static_cast(16), next_pow_2(m)); // next power of 2 if (mp2 <= 16) { // M in [1, 16] diff --git a/csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu b/csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu index 6e14de0c780..97c0e0da7b1 100644 --- a/csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu +++ b/csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu @@ -134,7 +134,7 @@ typename T::Gemm::Arguments args_from_options( using StrideB = typename T::StrideB; using StrideD = typename T::StrideD; using Sm100BlkScaledConfig = - typename T::Gemm::GemmKernel::CollectiveMainloop::Sm100BlkScaledConfig; + typename T::Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig; int m = static_cast(M); int n = static_cast(N); diff --git a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu index 2b6ab7fcec9..95aa92e25b3 100644 --- a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu +++ b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu @@ -96,7 +96,7 @@ void rms_norm_dynamic_per_token_quant_dispatch( std::optional const& scale_ub, std::optional& residual) { int32_t hidden_size = input.size(-1); - int32_t num_tokens = input.numel() / hidden_size; + auto num_tokens = input.numel() / hidden_size; dim3 grid(num_tokens); dim3 block(std::min(hidden_size, 1024)); diff --git a/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu b/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu index ec0bf2c3cb4..ea3bb429904 100644 --- a/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu +++ b/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu @@ -347,7 +347,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { for (int n_idx = 0; n_idx < WARP_NITER; 
++n_idx) { hmma16816_f32( C_frag[m_idx][n_idx], A_frag[reg_buf_idx][m_idx], - reinterpret_cast(BF_frag[reg_buf_idx][n_idx])); + reinterpret_cast(BF_frag[reg_buf_idx][n_idx])); } } } diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 83bbd1e6816..a974c881eb8 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -173,8 +173,8 @@ dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -197,9 +197,9 @@ dequant(int q) { // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); q >>= 4; - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); typename ScalarType::FragB frag_b; static constexpr uint32_t MUL = 0x3F803F80; @@ -221,8 +221,8 @@ dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); const int SUB = 0x64006400; const int MUL = 0x2c002c00; @@ -244,9 +244,9 @@ dequant(int q) { // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); q >>= 4; - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); typename ScalarType::FragB frag_b; static constexpr uint32_t MUL = 0x3F803F80; diff --git a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu index ba0a2410c03..ea96326ed7e 100644 --- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu +++ b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu @@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; diff --git a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu index cd1830764cc..c96d68d9b29 100644 --- a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu +++ b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu @@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) { static constexpr uint32_t HI = 0x00f000f0; static constexpr uint32_t EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. 
- uint32_t t0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - uint32_t t1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. static constexpr uint32_t SUB = 0x64086408; diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h index 49eee4128ee..b26505f771c 100644 --- a/csrc/quantization/marlin/sparse/common/mma.h +++ b/csrc/quantization/marlin/sparse/common/mma.h @@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); - int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index 2c3cae95e7f..29235264916 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -25,8 +25,9 @@ #include "../attention/dtype_fp8.cuh" #include "../quantization/fp8/amd/quant_utils.cuh" -#if defined(__HIPCC__) && (defined(__gfx90a__) || defined(__gfx942__)) - #define __HIP__MI300_MI250__ +#if defined(__HIPCC__) && \ + (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) + #define __HIP__GFX9__ #endif #if defined(NDEBUG) @@ -42,7 +43,7 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) -#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support +#if defined(__HIP__GFX9__) // TODO: Add NAVI support #define GCN_MFMA_INSTR1 __builtin_amdgcn_mfma_f32_16x16x4f32 #define GCN_MFMA_INSTR __builtin_amdgcn_mfma_f32_4x4x4f16 @@ -1479,7 +1480,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( } } -#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support +#else // !defined(__HIP__GFX9__) TODO: Add NAVI support // clang-format off template +torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b, + const int64_t rows_per_block); + +torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b, + const int64_t CuCount); + +void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, + at::Tensor& scale_a, at::Tensor& scale_b, const int64_t CuCount); + void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, diff --git a/csrc/rocm/skinny_gemms.cu b/csrc/rocm/skinny_gemms.cu new file mode 100644 index 00000000000..72d2820f2aa --- /dev/null +++ b/csrc/rocm/skinny_gemms.cu @@ -0,0 +1,1600 @@ +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "cuda_compat.h" +#include "dispatch_utils.h" +#include "quantization/fp8/common.cuh" + +#if defined(__HIPCC__) && (defined(__gfx90a__) || defined(__gfx942__)) + #define __HIP__MI300_MI250__ +#endif + +#if defined(__HIPCC__) && defined(__gfx942__) + #define __HIP__MI300__ +#endif + +#if defined(NDEBUG) + #undef NDEBUG + #include + #define UNREACHABLE_CODE assert(false); + #define NDEBUG +#else + #define UNREACHABLE_CODE assert(false); +#endif + +template +struct scalar {}; + +template +struct scalar2 {}; + +template 
+__device__ __forceinline__ float2 __s22float2(T v); + +template +__device__ __forceinline__ T __float2s(float v); + +template +__device__ __forceinline__ T __float22s2_rn(float2 v); + +// Definitions and cvt functions for fp16 +template <> +struct scalar { + using type = half; +}; + +template <> +struct scalar2 { + using type = __half2; +}; + +template <> +__device__ __forceinline__ half __float2s(float v) { + return __float2half(v); +} + +template <> +__device__ __forceinline__ float2 __s22float2(__half2 v) { + return __half22float2(v); +} + +template <> +__device__ __forceinline__ __half2 __float22s2_rn(float2 v) { + return __float22half2_rn(v); +} + +// Definitions and cvt functions for bf16 +template <> +struct scalar { + using type = __hip_bfloat16; +}; + +template <> +struct scalar2 { + using type = __hip_bfloat162; +}; + +template <> +__device__ __forceinline__ __hip_bfloat16 __float2s(float v) { + return __float2bfloat16(v); +} + +template <> +__device__ __forceinline__ float2 __s22float2(__hip_bfloat162 v) { + return __bfloat1622float2(v); +} + +template <> +__device__ __forceinline__ __hip_bfloat162 __float22s2_rn(float2 v) { + return __float22bfloat162_rn(v); +} + +template +__device__ __forceinline__ T loadnt(T* addr) { + return __builtin_nontemporal_load(addr); +} + +__device__ __forceinline__ float4 load_ntmprl(const float4* addr) { + auto addr_alias = reinterpret_cast(addr); + auto dat0 = loadnt(addr_alias); + auto dat1 = loadnt(addr_alias + 1); + auto dat2 = loadnt(addr_alias + 2); + auto dat3 = loadnt(addr_alias + 3); + return make_float4(dat0, dat1, dat2, dat3); +} + +// TBlock fetches entire rows of A, and entire col of B (K dimension); assume +// N=1 for time being grid is M/A_NUM_ROWS blocks +template +__global__ void LLGemm1_kernel(const scalar_t* in_a, const scalar_t* in_b, + scalar_t* out_c, const int K) { + using scalar2_t = typename scalar2::type; + auto af4 = reinterpret_cast(in_a); + auto bf4 = reinterpret_cast(in_b); + auto c = reinterpret_cast(out_c); + __shared__ float red_smem[NUM_A_ROWS_PER_BLOCK][WARP_SIZE]; + const int row_addr = blockIdx.x * NUM_A_ROWS_PER_BLOCK * K / 8; + const int threadid = threadIdx.x; + const int warp = threadIdx.x / WARP_SIZE; + const int lane = threadIdx.x % WARP_SIZE; + const int num_warps = blockDim.x / WARP_SIZE; + const int qwarpid = threadid / num_warps; + const int qthreadid = threadid % num_warps; + float4 rowA_elem4[NUM_A_ROWS_PER_BLOCK]; + scalar2_t colB_elem4x, colB_elem4y, colB_elem4z, colB_elem4w; + float acc[NUM_A_ROWS_PER_BLOCK]; + scalar2_t acch2; + scalar2_t oval; + + // As we later use warp shuffle operations, we may have more threads in the + // block than the actual available data, hence the if guard here. + if (threadid * 8 < K) { +#pragma unroll + for (int i = 0; i < NUM_A_ROWS_PER_BLOCK; i++) { + // rowA_elem4[i] holds 8 * half numbers seen as a single float4. + rowA_elem4[i] = load_ntmprl(&af4[row_addr + threadid + K / 8 * i]); + } + } + + colB_elem4x = bf4[threadid * 4 + 0]; + colB_elem4y = bf4[threadid * 4 + 1]; + colB_elem4z = bf4[threadid * 4 + 2]; + colB_elem4w = bf4[threadid * 4 + 3]; + + scalar2_t Af2; + [[maybe_unused]] scalar2_t Bf2; + float2 S; + + auto Ah2ptr = reinterpret_cast(&rowA_elem4); + scalar2_t* ah2lptr; + +#pragma unroll + for (int i = 0; i < NUM_A_ROWS_PER_BLOCK; i++) { + // Multiply-add on 8 scalar_t. 
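+    // rowA_elem4[i] packs 8 scalar_t values (four scalar2_t pairs) into one
+    // float4, so the __hmul2 / __hfma2 chain below covers this thread's whole
+    // 8-element chunk of row i against colB_elem4{x,y,z,w}.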
+ ah2lptr = Ah2ptr + i * 4; + Af2 = *(ah2lptr); + acch2 = __hmul2(Af2, colB_elem4x); + Af2 = *(ah2lptr + 1); + acch2 = __hfma2(Af2, colB_elem4y, acch2); + Af2 = *(ah2lptr + 2); + acch2 = __hfma2(Af2, colB_elem4z, acch2); + Af2 = *(ah2lptr + 3); + acch2 = __hfma2(Af2, colB_elem4w, acch2); + S = __s22float2(acch2); + + // See comment above concerning the if guard. + acc[i] = (threadid * 8 < K ? S.x + S.y : 0.f); + } + +// all reduce across warp. +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { +#pragma unroll + for (int i = 0; i < NUM_A_ROWS_PER_BLOCK; i++) { + acc[i] += __shfl_xor(acc[i], mask); + } + } + + // Warp leaders store the data to shared memory. + if (lane < NUM_A_ROWS_PER_BLOCK) { + red_smem[lane][warp] = acc[lane]; + } + + // Make sure the data is in shared memory. + __syncthreads(); + + if (qwarpid < NUM_A_ROWS_PER_BLOCK) { + acc[qwarpid] = qthreadid < num_warps ? red_smem[qwarpid][qthreadid] : 0.f; + for (int mask = num_warps / 2; mask >= 1; mask /= 2) { + acc[qwarpid] += __shfl_xor(acc[qwarpid], mask); + } + float oval2 = __shfl_xor(acc[qwarpid], num_warps); + + if (lane % (num_warps * 2) == 0) { + oval = __float22s2_rn(make_float2(acc[qwarpid], oval2)); + c[blockIdx.x * NUM_A_ROWS_PER_BLOCK / 2 + qwarpid / 2] = oval; + } + } +} + +torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b, + const int64_t rows_per_block) { + auto M = in_a.size(0); + auto K = in_a.size(1); + auto N = in_b.size(0); + + TORCH_CHECK(N == 1, "Row number of activation tensor must be 1."); + TORCH_CHECK(in_a.dtype() == in_b.dtype()); + TORCH_CHECK(in_b.dtype() == torch::kFloat16 || + in_b.dtype() == torch::kBFloat16); + + auto out_c = torch::empty( + {N, M}, torch::TensorOptions().dtype(in_b.dtype()).device(in_b.device())); + + // NUM_TREADS need to be a multiple of WARP_SIZE, as we are using warp shuffle + // operations. + const int NUM_THREADS = + K * 2 / 16 % WARP_SIZE == 0 + ? K * 2 / 16 + : K * 2 / 16 + (WARP_SIZE - K * 2 / 16 % WARP_SIZE); + + int NUM_BLOCKS = M / rows_per_block; + + const at::cuda::OptionalCUDAGuard device_guard(device_of(in_b)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // call the kernel function... 
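+  // NUM_THREADS above is K / 8 (each thread loads one float4 = 8 scalar_t per
+  // row) rounded up to a multiple of WARP_SIZE. Illustrative example with a
+  // 64-lane warp: K = 4000 gives 500 threads, rounded up to 512.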
+ AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "LLGemm1", [&] { + auto a_ptr = in_a.data_ptr(); + auto b_ptr = in_b.data_ptr(); + auto c_ptr = out_c.data_ptr(); + if (rows_per_block == 2) { + LLGemm1_kernel + <<>>(a_ptr, b_ptr, c_ptr, K); + } else if (rows_per_block == 4) { + LLGemm1_kernel + <<>>(a_ptr, b_ptr, c_ptr, K); + } else if (rows_per_block == 8) { + LLGemm1_kernel + <<>>(a_ptr, b_ptr, c_ptr, K); + } else if (rows_per_block == 16) { + LLGemm1_kernel + <<>>(a_ptr, b_ptr, c_ptr, K); + } else { + NUM_BLOCKS = M / 4; + LLGemm1_kernel + <<>>(a_ptr, b_ptr, c_ptr, K); + } + }); + + return out_c; +} + +#define DOT2C(V0, V2, V3) \ + if constexpr (std::is_same_v) { \ + asm("v_dot2c_f32_f16 %0, %2, %3" : "=v"(V0) : "0"(V0), "v"(V2), "v"(V3)); \ + } else if constexpr (std::is_same_v) { \ + float2 s = __bfloat1622float2(*((__hip_bfloat162*)(&(V2)))) * \ + __bfloat1622float2(*((__hip_bfloat162*)(&(V3)))); \ + V0 += (s.x + s.y); \ + } + +#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support +// This version targets cases where A[] fits LDS capacity +template +__global__ void __launch_bounds__(WvPrGrp* THRDS) + wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B, + const scalar_t* __restrict__ A, scalar_t* C, + const int _WvPrGrp, const int CuCount) { + using scalar8 = + __attribute__((__vector_size__((A_CHUNK / 2) * sizeof(float)))) float; + union bigType { + scalar_t h[A_CHUNK]; + float f[A_CHUNK / 2]; + float2 f2[A_CHUNK / 4]; + double d[A_CHUNK / 4]; + scalar8 h8; + }; + + //---------------------------------------------------- + // Reserving 64 KB of LDS to have 1 WG / CU + // Goal is to bring the activation matrix A to the LDS + // and use it across the lifetime of the work group + // TODO: When activation matrix is larger than 64 KB + // then this is not goint to work! + //---------------------------------------------------- + __shared__ scalar_t s[1024 * 32]; + + //---------------------------------------------------- + // Fetch the activation matrix to LDS + // Loop iteration: + // - Each thread (lane) is fetching 8 elements (A_Chunk) + // - Each wave will fetch 64*8=> 512 elements + // - Each WG will fetch 512 * 16 => 8K elements + // - Then the WG will move to another 8 K elements + // TODO: Logic below will only work when K is multiple of 8 + //---------------------------------------------------- + for (uint32_t k = 0; k < min(K * N, 32 * 1024); + k += THRDS * WvPrGrp * A_CHUNK) { + uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); + + if (k_in >= min(K * N, 32 * 1024)) break; + + *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in])); + } + __syncthreads(); + + if (threadIdx.y >= _WvPrGrp) return; + + uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE; + + float sum[N][YTILE]; + + //---------------------------------------------------- + // Each wave works on a single column of weight matrix. + // There are 16 waves per WG, and hence, each WG is + // working on 16 columns of weight matrix. Moreover, + // we tile in column direction by YTILE, so when YTILE=1 + // the above math is right, however, when YTILE=2 then + // each wave will be working on 2 columns and WG will + // be working on 32 columns. + // + // Top level loop that makes WGs persistent! 
+ // - WGs iterates across columns of weight matrix + // - Each wave within WG works on a given column(s) + // - After completing first set of columns, WGs start + // working on the next set of available columns + //---------------------------------------------------- + while (m < M) { + //---------------------------------------------------- + // 'sum' accumulates the matrix A x B computation + // split across 64 lanes. + // + // YTILE represents how many column of weight matrix + // are being worked on by each wave. + //---------------------------------------------------- + for (int i = 0; i < YTILE; i++) + for (int n = 0; n < N; n++) sum[n][i] = 0; + + bigType bigA[N][UNRL]; + bigType bigB[YTILE][UNRL]; + //---------------------------------------------------- + // Fetch weight matrix B in interleaved K-split! + // - Each thread (lane) is fetching 8 elements (A_Chunk) + // - Each wave will fetch 64*8=> 512 elements (1024B) + // - YTILE represents the number of column being serviced + // by wave + // - Loop for fetching weight matrix (B) are unrolled + // + // Fetch activation matrix A from LDS + // - Loop for fetching activation matrix (A) are unrolled + // + // Finally, do the matrix multiplication in an unrolled + // fashion. This provides lot of food for compiler + // scheduling. + // + // TODO: Logic below will only work when K is multiple of 8 + //---------------------------------------------------- + // for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { + for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { + // Fetch the weight matrix from memory! + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + const scalar_t* B_ = &B[(m + 0) * K + k_]; + bigB[0][k2].h8 = (loadnt((scalar8*)(&B_[0 * K]))); + //---------------------------------------------------- + // The following code with YTILE > 1 has to be deleted + //---------------------------------------------------- + if constexpr (YTILE >= 2) + bigB[1][k2].h8 = (loadnt((scalar8*)(&B_[1 * K]))); + if constexpr (YTILE >= 3) + bigB[2][k2].h8 = (loadnt((scalar8*)(&B_[2 * K]))); + if constexpr (YTILE >= 4) + bigB[3][k2].h8 = (loadnt((scalar8*)(&B_[3 * K]))); + if constexpr (YTILE >= 5) + bigB[4][k2].h8 = (loadnt((scalar8*)(&B_[4 * K]))); + if constexpr (YTILE >= 6) + bigB[5][k2].h8 = (loadnt((scalar8*)(&B_[5 * K]))); + if constexpr (YTILE >= 7) + bigB[6][k2].h8 = (loadnt((scalar8*)(&B_[6 * K]))); + if constexpr (YTILE >= 8) + bigB[7][k2].h8 = (loadnt((scalar8*)(&B_[7 * K]))); + } + + // Fetch activation matrix from either just LDS or from both LDS / memory + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + // Fetch A activation matrix in interleaved fashion from LDS or memory + + for (int n = 0; n < N; n++) { + bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); + } + } + + // Do the matrix multiplication in interleaved manner + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + // Do the matrix multiplication of activation and weight matrix + // - Remember the accumulation is happening for K-split of 64! 
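+        //   Each DOT2C consumes one packed pair of scalar_t values per operand
+        //   (a 32-bit float reinterpreted as 2 x fp16/bf16), so the loop over
+        //   A_CHUNK / 2 floats below covers the full 8-element chunk per lane.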
+ #pragma unroll + for (uint32_t n = 0; n < N; n++) { + #pragma unroll + for (uint32_t b = 0; b < A_CHUNK / 2; b++) { + DOT2C(sum[n][0], bigA[n][k2].f[b], bigB[0][k2].f[b]) + //---------------------------------------------------- + // The following code with YTILE > 1 + //---------------------------------------------------- + if constexpr (YTILE >= 2) { + DOT2C(sum[n][1], bigA[n][k2].f[b], bigB[1][k2].f[b]); + } + if constexpr (YTILE >= 3) { + DOT2C(sum[n][2], bigA[n][k2].f[b], bigB[2][k2].f[b]); + } + if constexpr (YTILE >= 4) { + DOT2C(sum[n][3], bigA[n][k2].f[b], bigB[3][k2].f[b]); + } + if constexpr (YTILE >= 5) { + DOT2C(sum[n][4], bigA[n][k2].f[b], bigB[4][k2].f[b]); + } + if constexpr (YTILE >= 6) { + DOT2C(sum[n][5], bigA[n][k2].f[b], bigB[5][k2].f[b]); + } + if constexpr (YTILE >= 7) { + DOT2C(sum[n][6], bigA[n][k2].f[b], bigB[6][k2].f[b]); + } + if constexpr (YTILE >= 8) { + DOT2C(sum[n][7], bigA[n][k2].f[b], bigB[7][k2].f[b]); + } + } + } + } + } + + //---------------------------------------------------- + // Final reduction step using shuffle + //---------------------------------------------------- + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 wave_shr:1 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:15 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:31 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + } + } + if (threadIdx.x == 63) { + for (int n = 0; n < N; n++) { + for (int i = 0; i < YTILE; i++) { + // if (commitColumn[i]) C[m + i + n * M] = __float2half(sum[n][i]); + C[m + i + n * M] = __float2s(sum[n][i]); + } + } + } + + m += CuCount * _WvPrGrp * YTILE; + } +} +#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support +template +__global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B, + const scalar_t* __restrict__ A, scalar_t* C, + const int _WvPrGrp, const int CuCount) { + UNREACHABLE_CODE +} +#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support + +#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support +// This version targets cases where A[] marginally exceeds LDS capacity +template +__global__ void __launch_bounds__(WvPrGrp* THRDS) + wvSplitK_hf_(const int K, const int M, const scalar_t* B, + const scalar_t* __restrict__ A, scalar_t* C, + const int _WvPrGrp, const int CuCount) { + using scalar8 = + __attribute__((__vector_size__((A_CHUNK / 2) * sizeof(float)))) float; + union bigType { + scalar_t h[A_CHUNK]; + float f[A_CHUNK / 2]; + float2 f2[A_CHUNK / 4]; + double d[A_CHUNK / 4]; + scalar8 h8; + }; + + //---------------------------------------------------- + // Reserving 64 KB of LDS to have 1 WG / CU + // Goal is to bring the activation matrix A to the LDS + // and use it across the lifetime of the work group + // TODO: When activation matrix is larger than 64 KB + // then this is not goint to work! 
+ //---------------------------------------------------- + __shared__ scalar_t s[1024 * 32]; + + //---------------------------------------------------- + // Computation of columns that need to be committed to memory! + //---------------------------------------------------- + uint32_t commitColumn[YTILE]; + for (uint32_t i = 0; i < YTILE; i++) { + commitColumn[i] = 1; + } + + //---------------------------------------------------- + // Indexing function into the column of weight matrix B + // Algorithm does 64 lane k-splitting / wave and uses + // WG ID and Thread ID to find the index. + //---------------------------------------------------- + // int _WvPrGrp = mindiv(N, CuCount * YTILE, WvPrGrp); + uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE; + + // Check whether there will be fragmenation! + // This will happen only for the last wave! + if (m < M && (m + YTILE) >= M) { + uint32_t startColumn = M - YTILE; + for (uint32_t i = 0; i < (m - startColumn); i++) { + commitColumn[i] = 0; + } + m = startColumn; + } + + //---------------------------------------------------- + // Fetch the activation matrix to LDS + // Loop iteration: + // - Each thread (lane) is fetching 8 elements (A_Chunk) + // - Each wave will fetch 64*8=> 512 elements + // - Each WG will fetch 512 * 16 => 8K elements + // - Then the WG will move to another 8 K elements + // TODO: Logic below will only work when K is multiple of 8 + //---------------------------------------------------- + for (uint32_t k = 0; k < min(K * N, 32 * 1024); + k += THRDS * WvPrGrp * A_CHUNK) { + uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); + + if (k_in >= min(K * N, 32 * 1024)) break; + + *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in])); + } + + __syncthreads(); + + if (threadIdx.y >= _WvPrGrp) return; + + float sum[N][YTILE]; + + //---------------------------------------------------- + // Each wave works on a single column of weight matrix. + // There are 16 waves per WG, and hence, each WG is + // working on 16 columns of weight matrix. Moreover, + // we tile in column direction by YTILE, so when YTILE=1 + // the above math is right, however, when YTILE=2 then + // each wave will be working on 2 columns and WG will + // be working on 32 columns. + // + // Top level loop that makes WGs persistent! + // - WGs iterates across columns of weight matrix + // - Each wave within WG works on a given column(s) + // - After completing first set of columns, WGs start + // working on the next set of available columns + //---------------------------------------------------- + while (m < M) { + //---------------------------------------------------- + // 'sum' accumulates the matrix A x B computation + // split across 64 lanes. + // + // YTILE represents how many column of weight matrix + // are being worked on by each wave. + //---------------------------------------------------- + for (int i = 0; i < YTILE; i++) + for (int n = 0; n < N; n++) sum[n][i] = 0; + + bigType bigA[N][UNRL]; + bigType bigB[YTILE][UNRL]; + //---------------------------------------------------- + // Fetch weight matrix B in interleaved K-split! 
+ // - Each thread (lane) is fetching 8 elements (A_Chunk) + // - Each wave will fetch 64*8=> 512 elements (1024B) + // - YTILE represents the number of column being serviced + // by wave + // - Loop for fetching weight matrix (B) are unrolled + // + // Fetch activation matrix A from LDS + // - Loop for fetching activation matrix (A) are unrolled + // + // Finally, do the matrix multiplication in an unrolled + // fashion. This provides lot of food for compiler + // scheduling. + // + // TODO: Logic below will only work when K is multiple of 8 + //---------------------------------------------------- + for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { + // Fetch the weight matrix from memory! + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + const scalar_t* B_ = &B[(m + 0) * K + k_]; + bigB[0][k2].h8 = (loadnt((scalar8*)(&B_[0 * K]))); + //---------------------------------------------------- + // The following code with YTILE > 1 has to be deleted + //---------------------------------------------------- + if constexpr (YTILE >= 2) + bigB[1][k2].h8 = (loadnt((scalar8*)(&B_[1 * K]))); + if constexpr (YTILE >= 3) + bigB[2][k2].h8 = (loadnt((scalar8*)(&B_[2 * K]))); + if constexpr (YTILE >= 4) + bigB[3][k2].h8 = (loadnt((scalar8*)(&B_[3 * K]))); + if constexpr (YTILE >= 5) + bigB[4][k2].h8 = (loadnt((scalar8*)(&B_[4 * K]))); + if constexpr (YTILE >= 6) + bigB[5][k2].h8 = (loadnt((scalar8*)(&B_[5 * K]))); + if constexpr (YTILE >= 7) + bigB[6][k2].h8 = (loadnt((scalar8*)(&B_[6 * K]))); + if constexpr (YTILE >= 8) + bigB[7][k2].h8 = (loadnt((scalar8*)(&B_[7 * K]))); + } + + // Fetch activation matrix from either just LDS or from both LDS / memory + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + // Fetch A activation matrix in interleaved fashion from LDS or memory + + for (int n = 0; n < N; n++) { + if (k_ + K * n < 32 * 1024) + bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); + else + bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n]))); + } + } + + // Do the matrix multiplication in interleaved manner + #pragma unroll + for (uint32_t n = 0; n < N; n++) { + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + // Do the matrix multiplication of activation and weight matrix + // - Remember the accumulation is happening for K-split of 64! 
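+          //   The accumulators sum[n][y] stay in fp32 for the entire K loop;
+          //   the result is converted back to scalar_t only once, at the final
+          //   store into C after the cross-lane reduction.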
+ #pragma unroll + for (uint32_t b = 0; b < A_CHUNK / 2; b++) { + DOT2C(sum[n][0], bigA[n][k2].f[b], bigB[0][k2].f[b]); + //---------------------------------------------------- + // The following code with YTILE > 1 + //---------------------------------------------------- + if constexpr (YTILE >= 2) { + DOT2C(sum[n][1], bigA[n][k2].f[b], bigB[1][k2].f[b]); + } + if constexpr (YTILE >= 3) { + DOT2C(sum[n][2], bigA[n][k2].f[b], bigB[2][k2].f[b]); + } + if constexpr (YTILE >= 4) { + DOT2C(sum[n][3], bigA[n][k2].f[b], bigB[3][k2].f[b]); + } + if constexpr (YTILE >= 5) { + DOT2C(sum[n][4], bigA[n][k2].f[b], bigB[4][k2].f[b]); + } + if constexpr (YTILE >= 6) { + DOT2C(sum[n][5], bigA[n][k2].f[b], bigB[5][k2].f[b]); + } + if constexpr (YTILE >= 7) { + DOT2C(sum[n][6], bigA[n][k2].f[b], bigB[6][k2].f[b]); + } + if constexpr (YTILE >= 8) { + DOT2C(sum[n][7], bigA[n][k2].f[b], bigB[7][k2].f[b]); + } + } + } + } + } + + //---------------------------------------------------- + // Final reduction step using shuffle + //---------------------------------------------------- + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 wave_shr:1 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:15 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:31 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + } + } + + if (threadIdx.x == 63) { + for (int n = 0; n < N; n++) { + for (int i = 0; i < YTILE; i++) { + if (commitColumn[i]) + C[m + i + n * M] = __float2s(sum[n][i]); + } + } + } + + m += CuCount * _WvPrGrp * YTILE; + + // Check whether there will be fragmenation! + // This will happen only for the last wave! 
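+    // Illustrative example (hypothetical sizes): M = 10, YTILE = 4, m = 8:
+    // startColumn becomes 6, commitColumn[0..1] are cleared, and m is rewound
+    // to 6, so this wave recomputes columns 6..9 but only commits 8 and 9.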
+ if (m < M && (m + YTILE) >= M) { + uint32_t startColumn = M - YTILE; + for (uint32_t i = 0; i < (m - startColumn); i++) { + commitColumn[i] = 0; + } + m = startColumn; + } + } +} + +#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support +template +__global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B, + const scalar_t* __restrict__ A, scalar_t* C, + const int _WvPrGrp, const int CuCount) { + UNREACHABLE_CODE +} +#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support + +#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support +// This version targets big A[] cases, where it is much larger than LDS capacity +template +__global__ void __launch_bounds__(WvPrGrp* THRDS) + wvSplitK_hf_big_(const int K, const int M, const scalar_t* B, + const scalar_t* __restrict__ A, scalar_t* C, + const int _WvPrGrp, const int CuCount) { + using scalar8 = + __attribute__((__vector_size__((A_CHUNK / 2) * sizeof(float)))) float; + + union bigType { + scalar_t h[A_CHUNK]; + float f[A_CHUNK / 2]; + float2 f2[A_CHUNK / 4]; + double d[A_CHUNK / 4]; + scalar8 h8; + }; + + //---------------------------------------------------- + // Reserving 64 KB of LDS to have 1 WG / CU + // Goal is to bring the activation matrix A to the LDS + // and use it across the lifetime of the work group + // TODO: When activation matrix is larger than 64 KB + // then this is not goint to work! + //---------------------------------------------------- + __shared__ scalar_t s[1024 * 32]; + + //---------------------------------------------------- + // Computation of columns that need to be committed to memory! + //---------------------------------------------------- + uint32_t commitColumn[YTILE]; + for (uint32_t i = 0; i < YTILE; i++) { + commitColumn[i] = 1; + } + + // int _WvPrGrp = mindiv(N, CuCount * YTILE, WvPrGrp); + if (threadIdx.y >= _WvPrGrp) return; + + //---------------------------------------------------- + // Indexing function into the column of weight matrix B + // Algorithm does 64 lane k-splitting / wave and uses + // WG ID and Thread ID to find the index. + //---------------------------------------------------- + uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE; + + // Check whether there will be fragmenation! + // This will happen only for the last wave! + if (m < M && (m + YTILE) >= M) { + uint32_t startColumn = M - YTILE; + for (uint32_t i = 0; i < (m - startColumn); i++) { + commitColumn[i] = 0; + } + m = startColumn; + } + + //---------------------------------------------------- + // Fetch the activation matrix to LDS + // Loop iteration: + // - Each thread (lane) is fetching 8 elements (A_Chunk) + // - Each wave will fetch 64*8=> 512 elements + // - Each WG will fetch 512 * 16 => 8K elements + // - Then the WG will move to another 8 K elements + // TODO: Logic below will only work when K is multiple of 8 + //---------------------------------------------------- + #define PCML + #ifndef PCML + for (uint32_t k = 0; k < min(K * N, 32 * 1024); + k += THRDS * WvPrGrp * A_CHUNK) { + uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); + + if (k_in >= min(K * N, 32 * 1024)) break; + + *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in])); + } + __syncthreads(); + #endif + + #define TUC (THRDS * UNRL * A_CHUNK) + uint32_t kBase = 0; + // find biggest k size that fits in LDS + uint32_t kFit = (32 * 1024) / N; + // kFit = (kFit%TWC==0) ? kFit : (kFit-kFit%TWC+TWC); //round up to multiple + // of TUC + kFit = (kFit % TUC == 0) + ? 
kFit + : (kFit - kFit % TUC); // round up to multiple of TUC + // if (kFit == 0) kFit = TUC; + kFit = min(kFit, K); + + float sum[N][YTILE]; + + //---------------------------------------------------- + // Each wave works on a single column of weight matrix. + // There are 16 waves per WG, and hence, each WG is + // working on 16 columns of weight matrix. Moreover, + // we tile in column direction by YTILE, so when YTILE=1 + // the above math is right, however, when YTILE=2 then + // each wave will be working on 2 columns and WG will + // be working on 32 columns. + // + // Top level loop that makes WGs persistent! + // - WGs iterates across columns of weight matrix + // - Each wave within WG works on a given column(s) + // - After completing first set of columns, WGs start + // working on the next set of available columns + //---------------------------------------------------- + #ifdef PCML + int YW = (YTILE * _WvPrGrp); + uint32_t Mrndp = (M % YW == 0) ? M : (M - M % YW + YW); + while (m < Mrndp) { + #else + while (m < M) { + #endif + //---------------------------------------------------- + // 'sum' accumulates the matrix A x B computation + // split across 64 lanes. + // + // YTILE represents how many column of weight matrix + // are being worked on by each wave. + //---------------------------------------------------- + for (int i = 0; i < YTILE; i++) + for (int n = 0; n < N; n++) sum[n][i] = 0; + + bigType bigA[N][UNRL]; + bigType bigB[YTILE][UNRL]; + //---------------------------------------------------- + // Fetch weight matrix B in interleaved K-split! + // - Each thread (lane) is fetching 8 elements (A_Chunk) + // - Each wave will fetch 64*8=> 512 elements (1024B) + // - YTILE represents the number of column being serviced + // by wave + // - Loop for fetching weight matrix (B) are unrolled + // + // Fetch activation matrix A from LDS + // - Loop for fetching activation matrix (A) are unrolled + // + // Finally, do the matrix multiplication in an unrolled + // fashion. This provides lot of food for compiler + // scheduling. + // + // TODO: Logic below will only work when K is multiple of 8 + //---------------------------------------------------- + for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { + #ifdef PCML + if ((k1 == 0) || (k1 == kBase + kFit)) { // load next chunk of A[] to LDS + if (k1 != 0) kBase += kFit; + __syncthreads(); + for (uint32_t k = 0; k < kFit; k += THRDS * _WvPrGrp * A_CHUNK) { + uint32_t kOff = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); + if (kBase + kOff >= K) break; + if (kOff >= kFit) break; + for (uint32_t n = 0; n < N; n++) { + uint32_t k_in = kBase + n * K + kOff; + uint32_t k_ot = n * kFit + kOff; + *((bigType*)(&s[k_ot])) = *((bigType*)(&A[k_in])); + } + } + __syncthreads(); + } + if (m >= M) continue; + #endif + + // Fetch the weight matrix from memory! 
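+      // B is read with non-temporal loads (loadnt) below: each weight element
+      // is consumed exactly once by this kernel, so caching it is not useful.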
+ #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + const scalar_t* B_ = &B[(m + 0) * K + k_]; + bigB[0][k2].h8 = (loadnt((scalar8*)(&B_[0 * K]))); + //---------------------------------------------------- + // The following code with YTILE > 1 has to be deleted + //---------------------------------------------------- + if constexpr (YTILE >= 2) + bigB[1][k2].h8 = (loadnt((scalar8*)(&B_[1 * K]))); + if constexpr (YTILE >= 3) + bigB[2][k2].h8 = (loadnt((scalar8*)(&B_[2 * K]))); + if constexpr (YTILE >= 4) + bigB[3][k2].h8 = (loadnt((scalar8*)(&B_[3 * K]))); + if constexpr (YTILE >= 5) + bigB[4][k2].h8 = (loadnt((scalar8*)(&B_[4 * K]))); + if constexpr (YTILE >= 6) + bigB[5][k2].h8 = (loadnt((scalar8*)(&B_[5 * K]))); + if constexpr (YTILE >= 7) + bigB[6][k2].h8 = (loadnt((scalar8*)(&B_[6 * K]))); + if constexpr (YTILE >= 8) + bigB[7][k2].h8 = (loadnt((scalar8*)(&B_[7 * K]))); + } + + // Fetch activation matrix from either just LDS or from both LDS / memory + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + // Fetch A activation matrix in interleaved fashion from LDS or memory + + for (int n = 0; n < N; n++) { + #ifdef PCML + bigA[n][k2] = *((const bigType*)(&(s[k_ - kBase + kFit * n]))); + #else + if (k_ + K * n < 32 * 1024) + bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); + else + bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n]))); + #endif + } + } + + // Do the matrix multiplication in interleaved manner + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + #pragma unroll + for (uint32_t n = 0; n < N; n++) { + // Do the matrix multiplication of activation and weight matrix + // - Remember the accumulation is happening for K-split of 64! 
+ #pragma unroll + for (uint32_t b = 0; b < A_CHUNK / 2; b++) { + DOT2C(sum[n][0], bigA[n][k2].f[b], bigB[0][k2].f[b]); + //---------------------------------------------------- + // The following code with YTILE > 1 + //---------------------------------------------------- + if constexpr (YTILE >= 2) { + DOT2C(sum[n][1], bigA[n][k2].f[b], bigB[1][k2].f[b]); + } + if constexpr (YTILE >= 3) { + DOT2C(sum[n][2], bigA[n][k2].f[b], bigB[2][k2].f[b]); + } + if constexpr (YTILE >= 4) { + DOT2C(sum[n][3], bigA[n][k2].f[b], bigB[3][k2].f[b]); + } + if constexpr (YTILE >= 5) { + DOT2C(sum[n][4], bigA[n][k2].f[b], bigB[4][k2].f[b]); + } + if constexpr (YTILE >= 6) { + DOT2C(sum[n][5], bigA[n][k2].f[b], bigB[5][k2].f[b]); + } + if constexpr (YTILE >= 7) { + DOT2C(sum[n][6], bigA[n][k2].f[b], bigB[6][k2].f[b]); + } + if constexpr (YTILE >= 8) { + DOT2C(sum[n][7], bigA[n][k2].f[b], bigB[7][k2].f[b]); + } + } + } + } + } + + #ifdef PCML + if (m >= M) { + m += CuCount * _WvPrGrp * YTILE; + kBase = 0; + continue; + } + #endif + + //---------------------------------------------------- + // Final reduction step using shuffle + //---------------------------------------------------- + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 wave_shr:1 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:15 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_bcast:31 bound_ctrl:0" + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + } + } + + if (threadIdx.x == 63) { + for (int n = 0; n < N; n++) { + for (int i = 0; i < YTILE; i++) { + if (commitColumn[i]) + C[m + i + n * M] = __float2s(sum[n][i]); + } + } + } + + m += CuCount * _WvPrGrp * YTILE; + kBase = 0; + + // Check whether there will be fragmenation! + // This will happen only for the last wave! 
+ if (m < M && (m + YTILE) >= M) { + uint32_t startColumn = M - YTILE; + for (uint32_t i = 0; i < (m - startColumn); i++) { + commitColumn[i] = 0; + } + m = startColumn; + } + } +} +#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support +template +__global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B, + const scalar_t* __restrict__ A, scalar_t* C, + const int _WvPrGrp, const int CuCount) { + UNREACHABLE_CODE +} +#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support + +int mindiv(int N, int div1, int div2) { + int nPrRnd = div1 * div2; + int rnds0 = N / nPrRnd; + nPrRnd -= div1 * 3; + int rnds3 = N / nPrRnd; + nPrRnd -= div1; + int rnds4 = N / nPrRnd; + nPrRnd -= div1; + int rnds5 = N / nPrRnd; + nPrRnd -= div1; + int rnds6 = N / nPrRnd; + nPrRnd -= div1; + int rnds7 = N / nPrRnd; + nPrRnd -= div1; + int rnds8 = N / nPrRnd; + nPrRnd -= div1; + int rnds9 = N / nPrRnd; + nPrRnd -= div1; + int rtn = div2; + if (rnds0 == rnds3) rtn = div2 - 3; + if (rnds0 == rnds4) rtn = div2 - 4; + if (rnds0 == rnds5) rtn = div2 - 5; + if (rnds0 == rnds6) rtn = div2 - 6; + if (rnds0 == rnds7) rtn = div2 - 7; + if (rnds0 == rnds8) rtn = div2 - 8; + if (rnds0 == rnds9) rtn = div2 - 9; + return rtn; +} + +torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b, + const int64_t CuCount) { + auto M_in = in_a.size(0); + auto K_in = in_a.size(1); + auto N_in = in_b.size(0); + + TORCH_CHECK(in_a.dtype() == in_b.dtype()); + TORCH_CHECK(K_in % 8 == 0, "k % 8 == 0"); + TORCH_CHECK(in_a.dtype() == torch::kFloat16 || + in_a.dtype() == torch::kBFloat16); + + auto out_c = torch::empty( + {N_in, M_in}, + torch::TensorOptions().dtype(in_b.dtype()).device(in_b.device())); + + dim3 grid(CuCount); + + const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + +#define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \ + _N) \ + { \ + dim3 block(64, _WvPrGrp); \ + if ((K_in * N_in <= 32 * 1024) && (M_in % _YTILEs == 0)) { \ + int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \ + wvSplitK_hf_sml_ \ + <<>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \ + CuCount); \ + } else if (K_in * N_in <= 32 * 1024 * 1.2) { \ + int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \ + wvSplitK_hf_ \ + <<>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \ + CuCount); \ + } else { \ + int __wvPrGrp = mindiv(M_in, CuCount * _YTILEb, _WvPrGrp); \ + wvSplitK_hf_big_ \ + <<>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \ + CuCount); \ + } \ + } + + AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitK", [&] { + using fptype = typename scalar::type; + fptype* af4 = reinterpret_cast(in_a.data_ptr()); + const fptype* bf4 = reinterpret_cast(in_b.data_ptr()); + fptype* c = reinterpret_cast(out_c.data_ptr()); + switch (N_in) { + case 1: + WVSPLITK(16, 2, 2, 2, 2, 2, 2, 1) + break; + case 2: + WVSPLITK(16, 2, 2, 2, 2, 2, 2, 2) + break; + case 3: + WVSPLITK(16, 4, 7, 7, 1, 1, 1, 3) + break; + case 4: + WVSPLITK(16, 4, 7, 7, 1, 1, 1, 4) + break; + default: + throw std::runtime_error( + "Unsupported N value: " + std::to_string(M_in) + "," + + std::to_string(K_in) + "," + std::to_string(N_in)); + } + }); + return out_c; +} + +#if defined(__HIP__MI300__) // TODO: Add NAVI support +template +__global__ void __launch_bounds__(WvPrGrp* THRDS) + wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, const fp8_t* B, + const fp8_t* __restrict__ A, scalar_t* C, + const float* __restrict__ s_A, + const float* __restrict__ s_B, const int 
_WvPrGrp, + const int CuCount) { + using scalar8 = + __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float; + using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int; + using intx4 = __attribute__((__vector_size__(4 * sizeof(int)))) int; + union bigType { + char f8[A_CHUNK]; + char2 c2[A_CHUNK / 2]; + scalar_t h[A_CHUNK / 2]; + float f[A_CHUNK / 4]; + int i[A_CHUNK / 4]; + long l[A_CHUNK / 8]; + intx4 l2[A_CHUNK / 16]; + scalar8 h8; + }; + + __shared__ fp8_t s[1024 * 64]; + + for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK; + k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) { + *((bigType*)(&s[k])) = *((bigType*)(&A[k])); + } + __syncthreads(); + + if (threadIdx.y >= _WvPrGrp) return; + + uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE; + + using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float; + floatx16 sum[N][YTILE]; + float sA = *s_A; + float sB = *s_B; + + while (m < M) { + for (int i = 0; i < YTILE; i++) + for (int n = 0; n < N; n++) sum[n][i] = {0.f}; + + bigType bigA[N][UNRL]; + bigType bigB[YTILE][UNRL]; + + for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + #pragma unroll + for (uint32_t n = 0; n < N; ++n) bigA[n][k2].h8 = {0.f}; + #pragma unroll + for (uint32_t y = 0; y < YTILE; ++y) bigB[y][k2].h8 = {0.f}; + } + + // Fetch the weight matrix from memory! + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + const fp8_t* B_ = &B[(m + 0) * Kp + k_]; + #pragma unroll + for (uint32_t y = 0; y < YTILE; ++y) { + bigB[y][k2].h8 = (loadnt((scalar8*)(&B_[y * Kp]))); + } + } + + // Fetch activation matrix from either just LDS or from both LDS / memory + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + for (int n = 0; n < N; n++) { + bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); + } + } + + // Do the matrix multiplication in interleaved manner + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + if (k >= K) break; + + for (uint32_t n = 0; n < N; n++) { + for (int i = 0; i < A_CHUNK; i += 8) { + for (int y = 0; y < YTILE; ++y) { + sum[n][y] = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8( + bigA[n][k2].l[i / 8], bigB[y][k2].l[i / 8], sum[n][y], 0, 0, + 0); + } + } + } + } + } + + // Final reduction + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + float accm0 = sum[n][y][0]; + float accm16 = sum[n][y][8]; + asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][1]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][9]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][2]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][10]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][3]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][11]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:8 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][4]), 
"v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:8 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][12]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][5]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][13]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][6]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][14]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][7]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][15]), "v"(accm16)); + accm0 += __shfl(accm0, 36); + accm16 += __shfl(accm16, 52); + sum[n][y][0] = accm0 + __shfl(accm16, 16); + } + } + + if (threadIdx.x == 0) { + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + C[m + y + n * M] = __float2s(sum[n][y][0] * sA * sB); + } + } + } + + m += CuCount * _WvPrGrp * YTILE; + } +} +#else // !defined(__HIP__MI300__) TODO: Add NAVI support +template +__global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, + const fp8_t* B, const fp8_t* __restrict__ A, + scalar_t* C, const float* __restrict__ s_A, + const float* __restrict__ s_B, + const int _WvPrGrp, const int CuCount) { + UNREACHABLE_CODE +} +#endif // defined(__HIP__MI300__) TODO: Add NAVI support + +#if defined(__HIP__MI300__) // TODO: Add NAVI support +template +__global__ void __launch_bounds__(WvPrGrp* THRDS) + wvSplitKQ_hf_(const int K, const int Kp, const int M, const fp8_t* B, + const fp8_t* __restrict__ A, scalar_t* C, + const float* __restrict__ s_A, const float* __restrict__ s_B, + const int _WvPrGrp, const int CuCount) { + using scalar8 = + __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float; + using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int; + using intx4 = __attribute__((__vector_size__(4 * sizeof(int)))) int; + union bigType { + char f8[A_CHUNK]; + char2 c2[A_CHUNK / 2]; + scalar_t h[A_CHUNK / 2]; + float f[A_CHUNK / 4]; + int i[A_CHUNK / 4]; + long l[A_CHUNK / 8]; + intx4 l2[A_CHUNK / 16]; + scalar8 h8; + }; + + __shared__ fp8_t s[1024 * 64]; + + for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK; + k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) { + *((bigType*)(&s[k])) = *((bigType*)(&A[k])); + } + __syncthreads(); + + if (threadIdx.y >= _WvPrGrp) return; + + uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE; + + using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float; + floatx16 sum[N][YTILE]; + float sA = *s_A; + float sB = *s_B; + + while (m < M) { + for (int i = 0; i < YTILE; i++) + for (int n = 0; n < N; n++) sum[n][i] = {0}; + + bigType bigA[N][UNRL]; + bigType bigB[YTILE][UNRL]; + + for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { + // Fetch the weight matrix from memory! + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + const fp8_t* B_ = &B[(m + 0) * Kp + k_]; + for (int y = 0; y < YTILE; ++y) { + if (y + m >= M) break; // To avoid mem access fault. 
+ bigB[y][k2].h8 = (loadnt((scalar8*)(&B_[y * Kp]))); + } + } + + // Fetch activation matrix from either just LDS or from both LDS / memory + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + for (int n = 0; n < N; n++) { + if (k_ + K * n < 64 * 1024) + bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); + else + bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n]))); + } + } + + // Do the matrix multiplication in interleaved manner + #pragma unroll + for (uint32_t k2 = 0; k2 < UNRL; k2++) { + uint32_t k = k1 + k2 * THRDS * A_CHUNK; + uint32_t k_ = k + threadIdx.x * A_CHUNK; + if (k_ >= K) break; + + for (uint32_t n = 0; n < N; n++) { + for (int i = 0; i < A_CHUNK; i += 8) { + for (int y = 0; y < YTILE; ++y) { + sum[n][y] = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8( + bigA[n][k2].l[i / 8], bigB[y][k2].l[i / 8], sum[n][y], 0, 0, + 0); + } + } + } + } + } + + // Final reduction + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + float accm0 = sum[n][y][0]; + float accm16 = sum[n][y][8]; + asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][1]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:1 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][9]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][2]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:2 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][10]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][3]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:3 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][11]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:8 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][4]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:8 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][12]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][5]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:9 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][13]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][6]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:10 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][14]), "v"(accm16)); + asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 " + : "=v"(accm0) + : "0"(accm0), "v"(sum[n][y][7]), "v"(accm0)); + asm("v_add_f32 %0, %2, %3 row_shl:11 bound_ctrl:0 " + : "=v"(accm16) + : "0"(accm16), "v"(sum[n][y][15]), "v"(accm16)); + accm0 += __shfl(accm0, 36); + accm16 += __shfl(accm16, 52); + sum[n][y][0] = accm0 + __shfl(accm16, 16); + } + } + + if (threadIdx.x == 0) { + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + if (y + m >= M) break; // To avoid mem access fault. 
+ C[m + y + n * M] = __float2s(sum[n][y][0] * sA * sB); + } + } + } + + m += CuCount * _WvPrGrp * YTILE; + } +} +#else // !defined(__HIP__MI300__) TODO: Add NAVI support +template +__global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M, + const fp8_t* B, const fp8_t* __restrict__ A, + scalar_t* C, const float* __restrict__ s_A, + const float* __restrict__ s_B, const int _WvPrGrp, + const int CuCount) { + UNREACHABLE_CODE +} +#endif // defined(__HIP__MI300__) TODO: Add NAVI support + +void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, + at::Tensor& scale_a, at::Tensor& scale_b, + const int64_t CuCount) { + static c10::ScalarType kFp8Type = is_fp8_ocp() + ? c10::ScalarType::Float8_e4m3fn + : c10::ScalarType::Float8_e4m3fnuz; + auto M_in = in_a.size(0); + auto K_in = in_a.size(1); + auto N_in = in_b.size(0); + auto Kp_in = in_a.stride(0); + TORCH_CHECK(K_in % 16 == 0, "k % 16 == 0"); + TORCH_CHECK(in_a.dtype() == in_b.dtype() && in_a.dtype() == kFp8Type); + TORCH_CHECK(out_c.dtype() == torch::kFloat16 || + out_c.dtype() == torch::kBFloat16); + + dim3 grid(CuCount); + const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + +#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \ + _N) \ + { \ + dim3 block(64, _WvPrGrp); \ + if ((K_in * N_in <= 64 * 1024) && (M_in % _YTILEs == 0)) { \ + int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \ + wvSplitKQ_hf_sml_ \ + <<>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \ + s_a, s_b, __wvPrGrp, CuCount); \ + } else { \ + int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \ + wvSplitKQ_hf_ \ + <<>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \ + s_a, s_b, __wvPrGrp, CuCount); \ + } \ + } + + AT_DISPATCH_REDUCED_FLOATING_TYPES(out_c.scalar_type(), "wvSplitKQ", [&] { + using fptype = typename scalar::type; + auto c_ptr = reinterpret_cast(out_c.data_ptr()); + auto s_a = scale_a.data_ptr(); + auto s_b = scale_b.data_ptr(); + VLLM_DISPATCH_FP8_TYPES(in_a.scalar_type(), "wvSplitKQ", [&] { + auto a_ptr = in_a.data_ptr(); + auto b_ptr = in_b.data_ptr(); + switch (N_in) { + case 1: + WVSPLITKQ(16, 2, 2, 2, 2, 2, 2, 1) + break; + case 2: + WVSPLITKQ(16, 2, 2, 2, 2, 2, 2, 2) + break; + case 3: + WVSPLITKQ(16, 4, 7, 7, 1, 1, 1, 3) + break; + case 4: + WVSPLITKQ(16, 4, 7, 7, 1, 1, 1, 4) + break; + default: + throw std::runtime_error( + "Unsupported N value: " + std::to_string(M_in) + "," + + std::to_string(K_in) + "," + std::to_string(N_in)); + } + }); + }); +} diff --git a/csrc/rocm/torch_bindings.cpp b/csrc/rocm/torch_bindings.cpp index 537e9357d52..4ac6fd1e994 100644 --- a/csrc/rocm/torch_bindings.cpp +++ b/csrc/rocm/torch_bindings.cpp @@ -14,6 +14,24 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) { // vLLM custom ops for rocm + // Custom gemm op for matrix-vector multiplication + rocm_ops.def( + "LLMM1(Tensor in_a, Tensor in_b, int rows_per_block) -> " + "Tensor"); + rocm_ops.impl("LLMM1", torch::kCUDA, &LLMM1); + + // Custom gemm op for skinny matrix-matrix multiplication + rocm_ops.def( + "wvSplitK(Tensor in_a, Tensor in_b, int CuCount) -> " + "Tensor"); + rocm_ops.impl("wvSplitK", torch::kCUDA, &wvSplitK); + + // wvSplitK for fp8 + rocm_ops.def( + "wvSplitKQ(Tensor in_a, Tensor in_b, Tensor! 
out_c, Tensor scale_a, " + " Tensor scale_b, int CuCount) -> ()"); + rocm_ops.impl("wvSplitKQ", torch::kCUDA, &wvSplitKQ); + // Custom attention op // Compute the attention between an input query and the cached // keys/values using PagedAttention. diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index b6ff6a006c0..5ed33097672 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -81,9 +81,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Activation ops // Activation function used in SwiGLU. - ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()"); + ops.def("silu_and_mul(Tensor! result, Tensor input) -> ()"); ops.impl("silu_and_mul", torch::kCUDA, &silu_and_mul); + ops.def( + "silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()"); + ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant); + ops.def("mul_and_silu(Tensor! out, Tensor input) -> ()"); ops.impl("mul_and_silu", torch::kCUDA, &mul_and_silu); @@ -443,6 +447,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("cutlass_sparse_compress(Tensor a) -> Tensor[]"); ops.impl("cutlass_sparse_compress", &cutlass_sparse_compress); + // CUTLASS MLA decode + ops.def( + "cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe," + " Tensor kv_c_and_k_pe_cache, Tensor seq_lens," + " Tensor page_table, float scale) -> ()"); + ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode); + // Mamba selective scan kernel ops.def( "selective_scan_fwd(Tensor! u, Tensor! delta," diff --git a/docker/Dockerfile b/docker/Dockerfile index d1ecef586d5..17adb7a92dc 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -5,11 +5,11 @@ # docs/source/contributing/dockerfile/dockerfile.md and # docs/source/assets/contributing/dockerfile-stages-dependency.png -ARG CUDA_VERSION=12.4.1 +ARG CUDA_VERSION=12.8.1 #################### BASE BUILD IMAGE #################### # prepare basic build environment FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base -ARG CUDA_VERSION=12.4.1 +ARG CUDA_VERSION=12.8.1 ARG PYTHON_VERSION=3.12 ARG TARGETPLATFORM ENV DEBIAN_FRONTEND=noninteractive @@ -19,7 +19,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && apt-get update -y \ && apt-get install -y ccache software-properties-common git curl sudo \ - && add-apt-repository ppa:deadsnakes/ppa \ + && for i in 1 2 3; do \ + add-apt-repository -y ppa:deadsnakes/ppa && break || \ + { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ + done \ && apt-get update -y \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ @@ -34,6 +37,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 # as it was causing spam when compiling the CUTLASS kernels @@ -66,7 +70,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ COPY requirements/common.txt requirements/common.txt COPY requirements/cuda.txt requirements/cuda.txt RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/cuda.txt + uv pip install 
--system -r requirements/cuda.txt \ + --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') # cuda arch list used by torch # can be useful for both `dev` and `test` @@ -89,9 +94,11 @@ COPY requirements/build.txt requirements/build.txt # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/build.txt + uv pip install --system -r requirements/build.txt \ + --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') COPY . . ARG GIT_REPO_CHECK=0 @@ -158,19 +165,25 @@ FROM base as dev # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" + +# Workaround for #17068 +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4" COPY requirements/lint.txt requirements/lint.txt COPY requirements/test.txt requirements/test.txt COPY requirements/dev.txt requirements/dev.txt RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/dev.txt + uv pip install --system -r requirements/dev.txt \ + --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') #################### DEV IMAGE #################### #################### vLLM installation IMAGE #################### # image with vLLM installed # TODO: Restore to base image after FlashInfer AOT wheel fixed FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base -ARG CUDA_VERSION=12.4.1 +ARG CUDA_VERSION=12.8.1 ARG PYTHON_VERSION=3.12 WORKDIR /vllm-workspace ENV DEBIAN_FRONTEND=noninteractive @@ -185,7 +198,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && apt-get update -y \ && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ - && add-apt-repository ppa:deadsnakes/ppa \ + && for i in 1 2 3; do \ + add-apt-repository -y ppa:deadsnakes/ppa && break || \ + { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ + done \ && apt-get update -y \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ @@ -200,6 +216,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" # Workaround for https://github.com/openai/triton/issues/2507 and # https://github.com/pytorch/pytorch/issues/107960 -- hopefully @@ -220,7 +237,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Install vllm wheel first, so that torch etc will be installed. 
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system dist/*.whl --verbose + uv pip install --system dist/*.whl --verbose \ + --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') # If we need to build FlashInfer wheel before its release: # $ export FLASHINFER_ENABLE_AOT=1 @@ -237,9 +255,17 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist RUN --mount=type=cache,target=/root/.cache/uv \ . /etc/environment && \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ - uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \ + # TESTING: install FlashInfer from source to test 2.7.0 final RC + FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' \ + uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.2.post1" ; \ fi COPY examples examples +COPY benchmarks benchmarks +COPY ./vllm/collect_env.py . + +RUN --mount=type=cache,target=/root/.cache/uv \ +. /etc/environment && \ +uv pip list # Although we build Flashinfer with AOT mode, there's still # some issues w.r.t. JIT compilation. Therefore we need to @@ -247,7 +273,8 @@ COPY examples examples # TODO: Remove this once FlashInfer AOT wheel is fixed COPY requirements/build.txt requirements/build.txt RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/build.txt + uv pip install --system -r requirements/build.txt \ + --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') #################### vLLM installation IMAGE #################### @@ -261,6 +288,11 @@ ADD . /vllm-workspace/ # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" + +# Workaround for #17068 +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4" # install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ @@ -289,6 +321,7 @@ RUN mv vllm test_docs/ #################### OPENAI API SERVER #################### # base openai image with additional requirements, for any subsequent openai-style images FROM vllm-base AS vllm-openai-base +ARG TARGETPLATFORM # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: https://github.com/astral-sh/uv/pull/1694 diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 54d1ce86d01..c647d9036f4 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -121,6 +121,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ADD ./tests/ ./tests/ ADD ./examples/ ./examples/ ADD ./benchmarks/ ./benchmarks/ +ADD ./vllm/collect_env.py . 
# install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch new file mode 100644 index 00000000000..6989106c429 --- /dev/null +++ b/docker/Dockerfile.nightly_torch @@ -0,0 +1,313 @@ +# The vLLM Dockerfile is used to construct vLLM image against torch nightly that can be directly used for testing + +# for torch nightly, cuda >=12.6 is required, +# use 12.8 due to FlashAttention issue with cuda 12.6 (https://github.com/vllm-project/vllm/issues/15435#issuecomment-2775924628) +ARG CUDA_VERSION=12.8.0 +# +#################### BASE BUILD IMAGE #################### +# prepare basic build environment +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base +ARG CUDA_VERSION=12.8.0 +ARG PYTHON_VERSION=3.12 +ARG TARGETPLATFORM +ENV DEBIAN_FRONTEND=noninteractive +# Install Python and other dependencies +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ + && apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl sudo \ + && for i in 1 2 3; do \ + add-apt-repository -y ppa:deadsnakes/ppa && break || \ + { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ + done \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ + && python3 --version \ + && python3 -m pip --version +# Install uv for faster pip installs +RUN --mount=type=cache,target=/root/.cache/uv \ + python3 -m pip install uv + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 +# as it was causing spam when compiling the CUTLASS kernels +RUN apt-get install -y gcc-10 g++-10 +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 +RUN < torch_build_versions.txt +RUN cat torch_build_versions.txt + +# cuda arch list used by torch +# can be useful for `test` +# explicitly set the list to avoid issues with torch 2.2 +# see https://github.com/pytorch/pytorch/pull/123243 + +# Override the arch list for flash-attn to reduce the binary size +ARG vllm_fa_cmake_gpu_arches='80-real;90-real' +ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches} +#################### BASE BUILD IMAGE #################### + +#################### WHEEL BUILD IMAGE #################### +FROM base AS build +ARG TARGETPLATFORM + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +COPY . . 
+ +RUN python3 use_existing_torch.py + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/build.txt + +ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi + +# Max jobs used by Ninja to build extensions +ARG max_jobs=16 +ENV MAX_JOBS=${max_jobs} +ARG nvcc_threads=2 +ENV NVCC_THREADS=$nvcc_threads + +ARG USE_SCCACHE +ARG SCCACHE_BUCKET_NAME=vllm-build-sccache +ARG SCCACHE_REGION_NAME=us-west-2 +ARG SCCACHE_S3_NO_CREDENTIALS=0 + +# if USE_SCCACHE is set, use sccache to speed up compilation +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,source=.git,target=.git \ + if [ "$USE_SCCACHE" = "1" ]; then \ + echo "Installing sccache..." \ + && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \ + && tar -xzf sccache.tar.gz \ + && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \ + && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ + && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \ + && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \ + && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ + && export SCCACHE_IDLE_TIMEOUT=0 \ + && export CMAKE_BUILD_TYPE=Release \ + && sccache --show-stats \ + && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ + && sccache --show-stats; \ + fi + +ENV CCACHE_DIR=/root/.cache/ccache +RUN --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,source=.git,target=.git \ + if [ "$USE_SCCACHE" != "1" ]; then \ + # Clean any existing CMake artifacts + rm -rf .deps && \ + mkdir -p .deps && \ + python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ + fi + +#################### WHEEL BUILD IMAGE #################### + +################### VLLM INSTALLED IMAGE #################### +# Setup clean environment for vLLM and its dependencies for test and api server using ubuntu22.04 with AOT flashinfer +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base +# prepare for environment starts +ARG CUDA_VERSION=12.8.0 +ARG PYTHON_VERSION=3.12 +WORKDIR /vllm-workspace +ENV DEBIAN_FRONTEND=noninteractive +ARG TARGETPLATFORM + +RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ + echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment + +# Install Python and other dependencies +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ + && apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ + && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ + && for i in 1 2 3; do \ + add-apt-repository -y ppa:deadsnakes/ppa && break || \ + { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ + done \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip 
--version + +RUN --mount=type=cache,target=/root/.cache/uv \ + python3 -m pip install uv + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +# Workaround for https://github.com/openai/triton/issues/2507 and +# https://github.com/pytorch/pytorch/issues/107960 -- hopefully +# this won't be needed for future versions of this docker image +# or future versions of triton. +RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ + +# get the nightly torch version used in the build to make sure the version is the same +COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu128 + +# install the vllm wheel +RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm-dist \ + --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system vllm-dist/*.whl --verbose + +# install xformers again for the new environment +RUN --mount=type=bind,from=base,src=/workspace/xformers-dist,target=/vllm-workspace/xformers-dist \ + --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system /vllm-workspace/xformers-dist/*.whl --verbose + +ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0' + +# install package for build flashinfer +# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738 +RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.post1 + + +# build flashinfer for torch nightly from source around 10 mins +# release version: v0.2.2.post1 +# todo(elainewy): cache flashinfer build result for faster build +ENV CCACHE_DIR=/root/.cache/ccache +RUN --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/root/.cache/uv \ + echo "git clone flashinfer..." \ + && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \ + && cd flashinfer \ + && git checkout v0.2.2.post1 \ + && git submodule update --init --recursive \ + && echo "finish git clone flashinfer..." \ + && rm -rf build \ + && export TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} \ + && FLASHINFER_ENABLE_AOT=1 python3 setup.py bdist_wheel --dist-dir=../flashinfer-dist --verbose \ + && cd .. \ + && rm -rf flashinfer + +# install flashinfer +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system flashinfer-dist/*.whl --verbose + +# install common packages +COPY requirements/common.txt requirements/common.txt +COPY use_existing_torch.py use_existing_torch.py +COPY pyproject.toml pyproject.toml + +COPY examples examples +COPY benchmarks benchmarks +COPY ./vllm/collect_env.py . 
+
+RUN python3 use_existing_torch.py
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system -r requirements/common.txt
+
+################### VLLM INSTALLED IMAGE ####################
+
+
+#################### UNITTEST IMAGE #############################
+FROM vllm-base as test
+COPY tests/ tests/
+
+# install build and runtime dependencies without stable torch version
+COPY requirements/nightly_torch_test.txt requirements/nightly_torch_test.txt
+
+# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
+# Reference: https://github.com/astral-sh/uv/pull/1694
+ENV UV_HTTP_TIMEOUT=500
+
+# install development dependencies (for testing)
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system -e tests/vllm_test_utils
+
+# enable fast downloads from hf (for testing)
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system hf_transfer
+ENV HF_HUB_ENABLE_HF_TRANSFER 1
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system -r requirements/nightly_torch_test.txt
+
+#################### UNITTEST IMAGE #############################
+
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index f9ebb10ca87..e60cf5e69a4 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -114,8 +114,16 @@ COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
 ENV TOKENIZERS_PARALLELISM=false
 
+# ENV that can improve safetensors loading and end-to-end time
+ENV SAFETENSORS_FAST_GPU=1
+
+# User-friendly environment setting for multiprocessing to avoid the RuntimeError below.
+# RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing,
+# you must use the 'spawn' start method
+# See https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
+ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
+
 # Performance environment variable.
ENV HIP_FORCE_DEV_KERNARG=1 CMD ["/bin/bash"] - diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index b8523fbc2a0..12009b8aa04 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG FA_BRANCH="1a7f4dfa" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" -ARG AITER_BRANCH="8970b25b" +ARG AITER_BRANCH="7e1ed08" ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base @@ -32,7 +32,10 @@ ENV DEBIAN_FRONTEND=noninteractive # Install Python and other dependencies RUN apt-get update -y \ && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \ - && add-apt-repository ppa:deadsnakes/ppa \ + && for i in 1 2 3; do \ + add-apt-repository -y ppa:deadsnakes/ppa && break || \ + { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ + done \ && apt-get update -y \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ python${PYTHON_VERSION}-lib2to3 python-is-python3 \ diff --git a/docker/Dockerfile.s390x b/docker/Dockerfile.s390x index 128929ac333..9c10cd56b59 100644 --- a/docker/Dockerfile.s390x +++ b/docker/Dockerfile.s390x @@ -16,7 +16,7 @@ ENV LANG=C.UTF-8 \ RUN microdnf install -y \ which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \ libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \ - openssl-devel openblas openblas-devel autoconf automake libtool cmake && \ + openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy && \ microdnf clean all # Python Installation @@ -123,6 +123,7 @@ ENV UV_LINK_MODE=copy ENV CARGO_HOME=/root/.cargo ENV RUSTUP_HOME=/root/.rustup ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" +ENV GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1 COPY . /workspace/vllm WORKDIR /workspace/vllm diff --git a/docker/Dockerfile.tpu b/docker/Dockerfile.tpu index 50806d8820a..295270d29f7 100644 --- a/docker/Dockerfile.tpu +++ b/docker/Dockerfile.tpu @@ -23,7 +23,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ python3 -m pip install \ -r requirements/tpu.txt -RUN python3 setup.py develop +RUN python3 -m pip install -e . # install development dependencies (for testing) RUN python3 -m pip install -e tests/vllm_test_utils diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index ad4abf16b43..681102b9d18 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -40,12 +40,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ python3 setup.py install -# Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu -# FIXME: This will be fix in ipex 2.7. just leave this here for awareness. 
-RUN --mount=type=cache,target=/root/.cache/pip \ - pip install intel-extension-for-pytorch==2.6.10+xpu \ - --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ - CMD ["/bin/bash"] FROM vllm-base AS vllm-openai diff --git a/docs/Makefile b/docs/Makefile index 5b801f79d1f..d3b429dfb92 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -22,3 +22,4 @@ help: clean: @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) rm -rf "$(SOURCEDIR)/getting_started/examples" + rm -rf "$(SOURCEDIR)/api/vllm" diff --git a/docs/source/api/engine/async_llm_engine.md b/docs/source/api/engine/async_llm_engine.md deleted file mode 100644 index 904feaa5051..00000000000 --- a/docs/source/api/engine/async_llm_engine.md +++ /dev/null @@ -1,7 +0,0 @@ -# AsyncLLMEngine - -```{eval-rst} -.. autoclass:: vllm.AsyncLLMEngine - :members: - :show-inheritance: -``` diff --git a/docs/source/api/engine/index.md b/docs/source/api/engine/index.md deleted file mode 100644 index b6544d94afd..00000000000 --- a/docs/source/api/engine/index.md +++ /dev/null @@ -1,17 +0,0 @@ -# vLLM Engine - -```{eval-rst} -.. automodule:: vllm.engine -``` - -```{eval-rst} -.. currentmodule:: vllm.engine -``` - -:::{toctree} -:caption: Engines -:maxdepth: 2 - -llm_engine -async_llm_engine -::: diff --git a/docs/source/api/engine/llm_engine.md b/docs/source/api/engine/llm_engine.md deleted file mode 100644 index d6613ef5562..00000000000 --- a/docs/source/api/engine/llm_engine.md +++ /dev/null @@ -1,7 +0,0 @@ -# LLMEngine - -```{eval-rst} -.. autoclass:: vllm.LLMEngine - :members: - :show-inheritance: -``` diff --git a/docs/source/api/inference_params.md b/docs/source/api/inference_params.md deleted file mode 100644 index 181c30cab9c..00000000000 --- a/docs/source/api/inference_params.md +++ /dev/null @@ -1,21 +0,0 @@ -# Inference Parameters - -Inference parameters for vLLM APIs. - -(sampling-params)= - -## Sampling Parameters - -```{eval-rst} -.. autoclass:: vllm.SamplingParams - :members: -``` - -(pooling-params)= - -## Pooling Parameters - -```{eval-rst} -.. autoclass:: vllm.PoolingParams - :members: -``` diff --git a/docs/source/api/model/adapters.md b/docs/source/api/model/adapters.md deleted file mode 100644 index e103a51d007..00000000000 --- a/docs/source/api/model/adapters.md +++ /dev/null @@ -1,9 +0,0 @@ -# Model Adapters - -## Module Contents - -```{eval-rst} -.. automodule:: vllm.model_executor.models.adapters - :members: - :member-order: bysource -``` diff --git a/docs/source/api/model/index.md b/docs/source/api/model/index.md deleted file mode 100644 index 8fee3a55c93..00000000000 --- a/docs/source/api/model/index.md +++ /dev/null @@ -1,11 +0,0 @@ -# Model Development - -## Submodules - -:::{toctree} -:maxdepth: 1 - -interfaces_base -interfaces -adapters -::: diff --git a/docs/source/api/model/interfaces.md b/docs/source/api/model/interfaces.md deleted file mode 100644 index 55bee57f64f..00000000000 --- a/docs/source/api/model/interfaces.md +++ /dev/null @@ -1,9 +0,0 @@ -# Optional Interfaces - -## Module Contents - -```{eval-rst} -.. automodule:: vllm.model_executor.models.interfaces - :members: - :member-order: bysource -``` diff --git a/docs/source/api/model/interfaces_base.md b/docs/source/api/model/interfaces_base.md deleted file mode 100644 index 75d58d34228..00000000000 --- a/docs/source/api/model/interfaces_base.md +++ /dev/null @@ -1,9 +0,0 @@ -# Base Model Interfaces - -## Module Contents - -```{eval-rst} -.. 
automodule:: vllm.model_executor.models.interfaces_base - :members: - :member-order: bysource -``` diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md deleted file mode 100644 index 069ed53e545..00000000000 --- a/docs/source/api/multimodal/index.md +++ /dev/null @@ -1,28 +0,0 @@ -(multi-modality)= - -# Multi-Modality - -vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. - -Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) -via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. - -Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal). - -## Module Contents - -```{eval-rst} -.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY -``` - -## Submodules - -:::{toctree} -:maxdepth: 1 - -inputs -parse -processing -profiling -registry -::: diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md deleted file mode 100644 index 21bd938be9e..00000000000 --- a/docs/source/api/multimodal/inputs.md +++ /dev/null @@ -1,49 +0,0 @@ -# Input Definitions - -## User-facing inputs - -```{eval-rst} -.. autodata:: vllm.multimodal.inputs.MultiModalDataDict -``` - -## Internal data structures - -```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.PlaceholderRange - :members: - :show-inheritance: -``` - -```{eval-rst} -.. autodata:: vllm.multimodal.inputs.NestedTensors -``` - -```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem - :members: - :show-inheritance: -``` - -```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig - :members: - :show-inheritance: -``` - -```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem - :members: - :show-inheritance: -``` - -```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs - :members: - :show-inheritance: -``` - -```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.MultiModalInputs - :members: - :show-inheritance: -``` diff --git a/docs/source/api/multimodal/parse.md b/docs/source/api/multimodal/parse.md deleted file mode 100644 index 4676139efe6..00000000000 --- a/docs/source/api/multimodal/parse.md +++ /dev/null @@ -1,9 +0,0 @@ -# Data Parsing - -## Module Contents - -```{eval-rst} -.. automodule:: vllm.multimodal.parse - :members: - :member-order: bysource -``` diff --git a/docs/source/api/multimodal/processing.md b/docs/source/api/multimodal/processing.md deleted file mode 100644 index 0d81c8d3966..00000000000 --- a/docs/source/api/multimodal/processing.md +++ /dev/null @@ -1,9 +0,0 @@ -# Data Processing - -## Module Contents - -```{eval-rst} -.. automodule:: vllm.multimodal.processing - :members: - :member-order: bysource -``` diff --git a/docs/source/api/multimodal/profiling.md b/docs/source/api/multimodal/profiling.md deleted file mode 100644 index b4551452122..00000000000 --- a/docs/source/api/multimodal/profiling.md +++ /dev/null @@ -1,9 +0,0 @@ -# Memory Profiling - -## Module Contents - -```{eval-rst} -.. automodule:: vllm.multimodal.profiling - :members: - :member-order: bysource -``` diff --git a/docs/source/api/multimodal/registry.md b/docs/source/api/multimodal/registry.md deleted file mode 100644 index 0737a4385cf..00000000000 --- a/docs/source/api/multimodal/registry.md +++ /dev/null @@ -1,9 +0,0 @@ -# Registry - -## Module Contents - -```{eval-rst} -.. 
automodule:: vllm.multimodal.registry - :members: - :member-order: bysource -``` diff --git a/docs/source/api/offline_inference/index.md b/docs/source/api/offline_inference/index.md deleted file mode 100644 index ec2cc599d92..00000000000 --- a/docs/source/api/offline_inference/index.md +++ /dev/null @@ -1,9 +0,0 @@ -# Offline Inference - -:::{toctree} -:caption: Contents -:maxdepth: 1 - -llm -llm_inputs -::: diff --git a/docs/source/api/offline_inference/llm.md b/docs/source/api/offline_inference/llm.md deleted file mode 100644 index 9f129d5e416..00000000000 --- a/docs/source/api/offline_inference/llm.md +++ /dev/null @@ -1,7 +0,0 @@ -# LLM Class - -```{eval-rst} -.. autoclass:: vllm.LLM - :members: - :show-inheritance: -``` diff --git a/docs/source/api/offline_inference/llm_inputs.md b/docs/source/api/offline_inference/llm_inputs.md deleted file mode 100644 index 21f688a12c5..00000000000 --- a/docs/source/api/offline_inference/llm_inputs.md +++ /dev/null @@ -1,19 +0,0 @@ -# LLM Inputs - -```{eval-rst} -.. autodata:: vllm.inputs.PromptType -``` - -```{eval-rst} -.. autoclass:: vllm.inputs.TextPrompt - :show-inheritance: - :members: - :member-order: bysource -``` - -```{eval-rst} -.. autoclass:: vllm.inputs.TokensPrompt - :show-inheritance: - :members: - :member-order: bysource -``` diff --git a/docs/source/api/summary.md b/docs/source/api/summary.md new file mode 100644 index 00000000000..46de545f9de --- /dev/null +++ b/docs/source/api/summary.md @@ -0,0 +1,133 @@ +# Summary + +(configuration)= + +## Configuration + +API documentation for vLLM's configuration classes. + +```{autodoc2-summary} + vllm.config.ModelConfig + vllm.config.CacheConfig + vllm.config.TokenizerPoolConfig + vllm.config.LoadConfig + vllm.config.ParallelConfig + vllm.config.SchedulerConfig + vllm.config.DeviceConfig + vllm.config.SpeculativeConfig + vllm.config.LoRAConfig + vllm.config.PromptAdapterConfig + vllm.config.MultiModalConfig + vllm.config.PoolerConfig + vllm.config.DecodingConfig + vllm.config.ObservabilityConfig + vllm.config.KVTransferConfig + vllm.config.CompilationConfig + vllm.config.VllmConfig +``` + +(offline-inference-api)= + +## Offline Inference + +LLM Class. + +```{autodoc2-summary} + vllm.LLM +``` + +LLM Inputs. + +```{autodoc2-summary} + vllm.inputs.PromptType + vllm.inputs.TextPrompt + vllm.inputs.TokensPrompt +``` + +## vLLM Engines + +Engine classes for offline and online inference. + +```{autodoc2-summary} + vllm.LLMEngine + vllm.AsyncLLMEngine +``` + +## Inference Parameters + +Inference parameters for vLLM APIs. + +(sampling-params)= +(pooling-params)= + +```{autodoc2-summary} + vllm.SamplingParams + vllm.PoolingParams +``` + +(multi-modality)= + +## Multi-Modality + +vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. + +Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) +via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. + +Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal). + +```{autodoc2-summary} + vllm.multimodal.MULTIMODAL_REGISTRY +``` + +### Inputs + +User-facing inputs. + +```{autodoc2-summary} + vllm.multimodal.inputs.MultiModalDataDict +``` + +Internal data structures. 
+ +```{autodoc2-summary} + vllm.multimodal.inputs.PlaceholderRange + vllm.multimodal.inputs.NestedTensors + vllm.multimodal.inputs.MultiModalFieldElem + vllm.multimodal.inputs.MultiModalFieldConfig + vllm.multimodal.inputs.MultiModalKwargsItem + vllm.multimodal.inputs.MultiModalKwargs + vllm.multimodal.inputs.MultiModalInputs +``` + +### Data Parsing + +```{autodoc2-summary} + vllm.multimodal.parse +``` + +### Data Processing + +```{autodoc2-summary} + vllm.multimodal.processing +``` + +### Memory Profiling + +```{autodoc2-summary} + vllm.multimodal.profiling +``` + +### Registry + +```{autodoc2-summary} + vllm.multimodal.registry +``` + +## Model Development + +```{autodoc2-summary} + vllm.model_executor.models.interfaces_base + vllm.model_executor.models.interfaces + vllm.model_executor.models.adapters +``` diff --git a/docs/source/assets/deployment/anything-llm-chat-with-doc.png b/docs/source/assets/deployment/anything-llm-chat-with-doc.png new file mode 100644 index 0000000000000000000000000000000000000000..f9b57f5c3cecc92da660efaddb4e75d8f72160b3 GIT binary patch literal 120834 zcmeFZXIxX;7CnkcL_h=uL_vy;DxlIkO7Fc_MM~(sgeIUMAOa#FHS`vG37v>Y5u^nI z(n9Zq9y;W0&;Oot^?2WVpWe6k0e&HS?^WiSYpglPn42&)6~4fuQ+??NaTj{x{|0r;T-e(><{)A9fQ zhQKEM;@_VyTK>FP58HWwhbMt2FZEQ*>%#gZp|6(qSjRT=YxG6A@^qd!%Zro`-xh&< zo{petJ=W{tBXF2)3*1(pYW?l2SERzyF--Y9*VLbavJc|-({MzdpX4n+e0$0CdfxN& zbzSgp;9!#c+ZPDPCGdX#^_j$O zqBcf?bQARFwe*LV=jZ3i^I1u#gZTd4FwoC+vt^3Si~r-fcb9>O?okI>{*Q+UY`n<+ z$JZ88uOyUj)sgdD{C#{shnsIi`82MIM>xlXZcQ5&Qe9+#vcg&HTGG}z1d|$9w?A=p zjbOX`A|S2{Jlx>wE?nWqYg;D?G4WQcz{K84w(Fdaj8D{V-1#17HEE+a*3A1I{?V$W zaiW0sd$Y568wXcrb>qM70`#k(%2@0#BlEpeZmw%(xOC;}utlhhPN@nVOrtv7QGan! z$*|foQrVzlsKm7D>kTW54%BSBJG(xDd(9w0bXBkEnD=hCVOS-z z1eV|crXH!G!K@ng)K}l_GDJX9?LXBaT}%zYaJJ_CHeJ@bKUOLIE)h?$?zi}oo^pX! 
zh6kBpxAN6hlP(DJmOCgX>z!S?WqIV&TYJWyP+CS5abB@YeQJ^H`l8*(_aZ9R@^c%u z$o2#hKTxJJw4?LT(44}usDi}Az#=(fm1wR@u>l^4TJ8V5%2&it-H7Ve8LMCE?nyIo zs7riPQz{jH``zyz1uVQXk=sPwR*UH1d1p8*WlL# zgZBCcg$>n@5>nNGX0%vpdEMvcEFrCGF;N?X~Bpvo}se0;-$S))IFOq`4jERj46k37Uo2*%o;SxNee^cU#l6I#xqmhu>3Yeiq%nlg@vCv zGDNtb@$@Mb-C9k7NP?_)gaY_P=b$w;oK00S^JBuFo7ciLeNN@^$;YpDr%91@`{PJE zLkzunjUgeCpo5h)R2OMNl9O$ns;_(Tk=6Uy$96lJDgz4U6_s+N$z39an~` zoGA!(3*txY^UD-I{_M1YU)gg(8!$4_d+jnAZx6>0{NJ3)L|3!AHBl%!j%AAZ5wV5G zy8J|c?8C13&!35C?IE`%Q&&fnDg*siN1v7uDtXoKZzb)2O>}_75q!Ysj5I2}g>4Uu z2L?idUe^-m%1xh9wdt7Ub?$}H7WWIlsl=7y5Ngz48gbqEAKyU(OJ9%9 z4<1ehFBVj;ep<{OZ30`sRu{WbEmGzrP75aaex)U40AOIdOs=%bgY%HY_b@kzXM#9N3d;@o?C5qW0iLw^C1Tq znO+PPN1v$o$`eg=CbItjo;b8sZ!<*Yr$#MRFi6$Y>&@eA z`*O%2K{5ThkOz+k1)k|54Mz(r6b-eE%G0d#d_`CPU)8;Z)Iq_&1h5!WZqe-qsUjf1(bY z>;lMgZ*Txx7-qGRG+i_lu}Hu#9tjzdQk)6)PZ}zo4-vE{^)qsFA{}3rnS7U`FJBxgzd928eTyZRAf5mPxkKh()ttK zFH>?#q~|&@?BD(O0fmR3!vtI!$}hP5Ke{yn;-nd|b?JY_aX;e~W#CeQy5_Y%X2qyh zix-&m#qUSNA4mD+k4eBKjjY_ee-1z0=>k6Qo0Br?|1k_+;8Ko^vh*Kr^h#m|pTLIr z;5pa-7={gg94+s}9a9VNhI`4$fVtjSgw25|;-vwT+P0HIgy% zlgB;0JR@}Z9r;UCyZw!)F&!P>Z^Q3Zqy?Eg*FPKVJZ(X5&E0L16r70tEbR5Yf%ED4 z^kad$n~x!WPt&#_bKe+lQ;Rr=Hg7?KPLCKUu)V^ZJEyw|bVov^eqr>~zGCn+$MZgE zHN%;!GaZDo^P)HWjiPI&x7$QHPj|0Ym2cC*sE}=^PA|BMx3g2n&XV@(i#02#Gx4+jO+9 zsuOx9DyAcJiLBSr_Wj}e@g#8^HyuMFvEAE=rUle6q;;H{*Y<1s)*L?W?G@=pxm?d> z%q?}R!=w9sK5O2xT8k#HM-l=|=TF!+U2miz%uaR^%Bs<`&4^tE;%40H<9gjdn#quK zl9S#=-UCRdm--7)H{Q1>t$FcLNvEEn0RAfo91=yUbX4O<8H+H!l3*jadHj)ByG!cNEqI-rQ7smaCiH(KI0Zj1 z{r^w>2Y~)Rutt8$=f}5$fc8IzOF$s?sJ_lJA@@~-o+Kb&thR|asjJ9|Q^08byyKtc zq0(m@N%i&6iMJoC`Zio@YGpfI1dn6a(ev=|P|!zCC`Yd{{Y)S%WYT4Nl7_S8HEpXT z&4Zp;EEe{Y{|cCDcor)k+V)3GBkWp_v8)&FsKs8_A(qAqlmdBxn_;~6V-MX9$Et1Z zF-g4V3Db)`$sFC%Rlf(A65z{P9-_xEono6~a~F(|dyxlbEW?JH!@wHR2c?Fj7BLgp ze0U%)c$rb{?uD~i+kwnDRH1fv!fwYVVtiE__K-h8CE!8Og4wSllS`5?Fi1ib>-Kq| zEvQWw>)qGZM@*NPsy#^qld_FDiJ$!(BxRz*pe31R6=b0+4!+eV&cV#gJTNP8^*7{~ zE0D-hh$|XZ5!OFD#`=}w1YIppV(`A@^UO#n$FXS_J#P)sX@X$idRS}Fol*H#(-xAi zam@(9(;_5LZUIk6zJXXrCO@qROy;)<@pX3J1J;V{0ycZpwa-ILuF`4dnPIaV_(l zIEP*t;|r+CGS}h>rUq&Z`St+5*?f`|#P?JqdTE!dxy=9U_{e;YXgFV_m>??~@!9rj zgjV0@+g1E&vY<+~x)OC*O2pU=c2=_-XGentCT{M^{>JhymTw+{s|NXWNdxs|-V&5% znx0$>>S$~E6L&E--6J=ZH`wTY?=_*-+RW7`9aHJ?VZhYap*wxzIHTDoAJkuk2(S1( zi)Ol&^fB;PAIxDTKh4l@p_v^2p7OZ(D~kDczV~59Xp#Lm{5)6avcxh|gM2KL$AC2T z5$!oM6w<|ETFo>hhnX@zn8dWxI+Ue}`W%}{c}YmPY8tJ-H=Yg=6~XQezPUuAm5)#2 zvr(6aLEz4t-}7?bd~!7Ih)FQ@CqmV6Kv|ZZF}x#s%AO_{FI_3S#clc3P9^vfX;CtX zKd*;>rDadW_FM-w&5yVGYC*!l4zf5|uZ)7;1h4-n=kwf{_;jHZotH9;S*oS=TZoM3 zY59Qf4D6!v2a54OYBuUmS4wQFvJcu!d@vSt!fs`3oVB|P38wXr{SinQ8*w~7kFK%1 z_D9jSslDwEhVjmrdl3O<*Q5Hswj58usExNsIK$dBDSz3jml>m7E=+b0<*T=n1tUdO zvXhlu|N5JWJD2GX-0Hv7*-fW6%@Ae6$FUv!<%H3Y!acx~wd&1iE z!3eIVmlVt@4^a|57YJ{g@lSeXVP=`6>KDgs?a__ik1z5bOa&mwmap6vcx0I_BN}#o zet;MswMt!3K0lel9h>ODCdz70!OZ+6oA7amO8QXxRi7E7S%@L4nj{;G(~!DsBIFB8 z=k@y_$Eq`N+>vToL8t73u9Jfy6R9_`jwdpO=l2}Z%!jI9G;$F(Ali$}QB>d|V4IPM z3?g48-TlT4{XwBBw!*>do1>Yhl8fe}6ZWVYoQyoK0uQd|LH&0~Hzpfuu&b!iAE+HSW_V`qeJ2dD|*1K~UNTf&4UPtH5 zi0baGn9J6}*JQk>#~qoqYC=x(JHj9p58Xx8W^e7sIb|sxeQuAWO)fbz8W~&}R27fB z$(fVJPi^d#L8kWY10_`Auq?;yrq&@mQ$ao5(lC!nIq8_m+CA?}NSpndhlcoXexmmA z)dtwjF$5n^q3T|=h4+&dh7gjvx4=p#)b3)XkkTcKz2!;5GMKU2=fMG- z;!QgKsB4%@x~@$u(9<|s?>N<^WK6GZ6Fd8u^HX8XZI*l3lB3BcXR1ccv(-qq!ar97E)UYcSM9{*uMnTVCrn_F* zL{(-b>&Zz%D5a3mWZ0CLH;b3m)#RSuCxzOKRTZ4AFg@3Kl$+Y^^LFiV_N&Ms4jPs0 zJ62-{mBJ?WpLoC;Qe<&Wso$Ow(#!hePz#EeP7A!&Q*E1vn`j~jXs&Wq`d0dJ8E?zAKFmw;pPHU zVfD&#pozh(U)TX z9JEkUfb3F*{~2UT2HUP~sFl}?*u;*Uq$hMu_Vr-0_6rU^?As>OqlQ;{m&vWT+p#~6 
z+Ck-2#Z@0dnpb)Xzr9g)bEQ5Lv3kF`oh_foE9DSOGOy&~x?}y|G1mHSTtFZYupP%5 z(ejOY9f{lalK&P3G`!kyrpzA@RZS!wBG{raaZ92dcA1ZnJzgl7a&FQQJ6Y#uxJk5g zeE9ie|KFhAGo1Uwh4lG5d1elOE>bQgo%ZtN$$8d~kj;qfa8D>qrYqm1%%b zH>B~}Pk6K6%zAE;m+Fxudc2&4-pfsP*l7YdaL8^Czh9ZWXmV8&a>__GoiSZ2J&4lr zuub_&(Hf_$@_oOJx}{?ial*RKF(Ol5bEFK7gI%fwHX6QsRg=ivBR z`X&7PNfFZQAXbUYxV)qW=jX&q^9(u}Ii(DnBYg*!>U|%Lbhk)LbQMJTYu@22((LRF z8#<2Cc}_ex2rDpd@DRhxmc^P4+~nrqEuN908RNAXVzHeGk|7?lreX&Us34xC(0g)8 zQo8IK;7_q3j|w=koCPy>L$KWZy%s`-AKZN+<^gqRg<>N=;`;?1kzM`w+YKt4WTWVt zsUw+ebc|^@bLa(#-?OmXPq6=B^lEYZTkILO|F-B!9lf$m!zrp^=|N3boQ{82Krvk2 zu%JqdfG6t12QkV2Tia-}#sr58$+TR+?)UfKP_7X6KlNJMgygbBXVSWpF34AThQ1#( z;+C!Te#uiBrc<9d7c$I(x06beX5CDKMfOM2fFyUAv0ekw_dy^^w_Ec%V2#50E^RzH z$#*ghuOc8hEVa≠lXra~M`_xfWVKnHViOP#--V{He~hWB;B z((HqLbw+7X{$nHmIxtJO(nzrMF4Fg4($nAle(`DL7}kH_(LGW+HU|x@!Z_OFEWBvE zA>lb_$-&zXkZl(7Y(5sJjOo`Q0&nto9ZD!c+(FrR&%BV$Acw^D7={454mzJAvfe7E zr!=T67v|bJt>*LNwJph6Nb?xTDkwW$>hP4eW&aC3l^*e?^M{6o+e1tbNN*us7xbGS zq*Hv%OaHJ)IyNkC?=^_ku~`X;c6Z;!HZGzsnop(((oV2xWrsgdbF;2^;q3(fZ+ZtL zO3efcE`6~8=7vlgIh`N*SKsYn;I{5(6c>>b73L{-QsoD{xW?u~5K%ptYa<(OaBcvj zW@X7p6E@h_hY^D0n-o9tghq~CZ@jwSASbg7E@JMU_%PWzNq}I!9nc6RTMQTTTffZz zPBHdspS!4+n~4R7@~1~Fgz0Y0d36^34BqoGYG!#EYc1K&4K3~z@N3`_O!s#&`Ju|r zr2+Z^gk%4v|0D_t6e#o6)jszL&g$%{BaL_rR16fK+v*`hOJpPBo327)eK#Q}>o>W; z88xQL0yB9~S*3IkKP}Je>HAzB%X*l(P)T)o^n8rxLo+=HIL z%o(LN`PIP83>gCjgI1`U7~bj zB7E;ZGspNn- zol((vgnQ9zWKD{lhLu$yJF(^&39W`Qy?vxTC+%|1C-0?W_{gropF+t zW5sIOVgA7qLHq3K8ArxdZhP9g#ug12xS^BY^0q3!_`7{r%Bg%{M%H_wE;66vxhV0| zuT|ZXk8-xM@sO$|{O=&!))v|AoQJ!y$wly5`3v_!7DHD->6A{$>_+0ZubHvr-rXR# zeYftw8c%jfy0yh>b;i3f&y8mmn0Aa5%}hN;Eza`{YaE7$tL5$0H>rx>yO$?V2&Pq) zuHC6VK>JX49;4r0%K)}c8tZf}z4i?27TZHLiZtlbuRH9o>Hamz9unLMRN3XiEo>T* zDh(fxBU*$Zn+RIP-f{L!qdVN*%3v{ z_ap=hs>GD}d67GOfM5V>O+))^ikjvH-I!tLV`a=r6HhkAv(*1_zmZM$={->D{!~Cf z@@j~^w4tCpmgUfG+`i_>+l;bIf30iC+Xl*{T(T0v@a?|bShHCRzG*2O#)&1@pv@Dc z=HM^c#SLJm)%y$K+{0aRLY#6*Q_>xl_0zijlF~~#cwr%S1y-6I{4HBT{OgLlpYv;? zKD^U5JApLRY>P3_;d7PzWSs41{w{Na{XD5&&#r~yDL~Ac{)=i-%~r` z#l16Ox27O?TxHBI+q9}vgzV3|;NZP2ywS*gSMRz>O_vT`7i^zhAhUvplD#LcdbpS` zhmTPH$emNj6a{l78Oz^i`D-*vUcpBr?t#vrhlD37%e-zKxJ#lw98ox=9 zm`PWfE`L6h<7zb++1LwCH&j=Q-Dktgtb+jY>3qWSFx*-(m|92o95P#X2?!c;(`7Z! zpX?xX)L0D~c9vkvrLy(t7c?4r$dB)yD2!ctFZ(-F%)+9UDWTjl3&(LJ^&bLR#{qLD zW5;Y;1m=uPtZZ{v`L5B8zP&&~hCRh}hQ;_<9Y@AuNX~RD1f8GI<;36doxLPf?YXF7 z*R1zIuH1#7hcWbx&wlHoD;5xSe1U9{HZzO%c@HYH+5UEf1%^gKkv%|x|PdqR?x5E{{Lx%r5u>Y zbqbAN`Lh3jPxNg-f%@j0P5CR?`F{?d^sj&drJy8t;SZd8CE-GFdFXyejK-g147^SP zR4qNe1Nw&@{_o@8uH~nsykPljgg`A$Hmb;BrP#B)(DA|Rgh4U9aTA0zcy;wo-+~PW{ zeD}Glb9whP-Y-wGeZX@a(#nSQ)cr)CHcB~|LEEaBv+J{`Nn3@5ZVpzYVgcwvh5+kB z%im5h39$>1q4|q54}Rvg5@`|!O=H7OLlN;hSq4bCRT1O}(hxHrBH5H!^@Q~o#WHZ` zKCt8>#>%}v%_jFj!XPzi*!qjDJR3*FrzAfW?rvVggZnK>a<@!=2};e{`EJI@~z^^V5?nZjr7`Y`1 z3c|g6H{%68(=0gj!Qi;T;x10*VukT|9q-V0ccV&Aeni6Ux}xGMoxm~P-+pmR3F&Kq zW~*MpvlIrn-Pb$O=*9w6W07Z!VCBTBGHr9x3)*%+>KM5zW;3q59>U zV!s1dLtNWeGtKWEg@hRMxN)ASI>bYaHM+45NT2c_@`4*|6sv7eMSdsyM((F-bG?rgvr-_nP$};$6-;}-255wmH@SB^@?nlG^>2BZX$tJ?eh4~*lSp1Zx z^&9S15dyC8Lk1t^gDYN6nr!5uMI;-$i6z|gS)l1YweXMehS7~oA`7)So=okhWf<0! z@VPBd=jWBXtvoSo^l~2ZdocPbEsY+#RmD@2B$4w4fuG{i#R?x(5thDfTIXVM(jo2; z@k2+)wDDwvd^`YG+`8$|xP1wbO5QXb&MJDGA1#ef)VX4kY)e>?8AWLfnE&8i;10Xm zVNKPJ4?5-6Uw$qYSSVV99$Flyf+SX(m0NHaU56MCQS?|)s46%%DT<83lSRPcujds- zA$w&syf%yDNe3>-YB8_16{1qtZ!)PeYXPNA|0EsH((}BGGF1=T(e-$iXAB! 
zNgfjhz1F7Q;;xJTmQ}og_!D-gr821CeEsH)L1Y z8Ig6@LH@r<79nG(NZ!nJhH@pWNjgTrfCV-r2t{4S+Q05)Oq~n*>%&8&riO3~JIs3x zFBkcD@CH~(=s<#JkyEn|_dg#MJPOlekTCjwJv8~Rj25IE_#cVO4l-)LLbpE|qo!Pt z(7H*g9)4kUE`!92y+%~;UkyAk^l(FZq5zJ9d%H~Xfcsi%^rV_6;oLFw*J zs?nskRPSG_8C#9CAS0*M=sgTy#u`D){!rdkZT3^@C*pG!TDw0sblRIoQ6Bw{R8y-ab?K_U&yZ+5Y*7U!TD9 zA+l0&D$D=oa{Vn|`LsYLM;b$+f6L@YxJ>@ecYNHO7c|4`pPd}aq=#5XczPlfz$#4I#OTdh%6ON)X>*;@n;Du&KzkfL-1h==ZZ}#;L+G1DNXM7?e7LbWriyPRDG{k63iRO#xL-&+L>I)P?KyaUq zA3*cts}6d6-@>dCiVE0;*~2njUvzZ<+nEk zo+r36QJYwhUu8YoElf!G4sx_}{UFrv#NdI`X#!^GWMExw2dbj>EM2zTKC5^_J|bfW z-5}qkP7}%nETm6hzGfqBfB*&aadzSWA-g$CsJhjM;|@yJ)SN(eK-Y(gtm8ueAgU`Z z!lsPE=8Yt70-LI1keXW`)8MTO?d&OZd(yc`#?QuOwM`z7M}&5CNZQ}mxb+2*3#0~^ zL85}~`Yj>`kdmUNh#1_9LCiS+t?p28}U9b=Kk&7_bE zdGMvhY{YozOZ$A;8by~}I^l|sQ}mB7eXVd7?QDbgYS-iGru^rW-B1v->j5}31?-E}K1n(H46xWrM1${v3U)Jw8q>q%{Lkl5v$vQzreqWfy zcdPEy;Sc+;G@)nTb>r}e2ni)6rMefAb3N1>39%56i^3-%p<|B-*h}E)F5}?s&KkVq z>4d4QJ+ITfk!+csq80C=ousV&VDzo}y6eUR?jNo6`>T6;i&akD3srkESHxORj@0#@ zslfIzLQ?eVip$MR9siVaJcrR(2C7l#1|a;t_{{q9kDkw%Bu%kV3+O3ubw~HSdBU3% zk#`;O#w@P%B%q?T)PA%Ntq(n_eV%*$OUuS~e`?96<<W+4S1?nn zA-feH_hq(SS_|=rcVFPWSX$U#vpt=(D6eURtEc=mV;lCwQu14LbC?<8?lkMNdo;zg zt5oFJW;~4);aF<-JZc>f+U7}u6Wa1%Kg*Hj$Wg27{^rE7xUIvicB=JI7An+RcQ@v* zY!197Dj@%y4)Jwz|H0^)i=j{XJi+ID61j*jp6@VC3WvKR-Fu0NgG1S}LWd@ykm9a9 z>iqn2&RiYa>k=AVqUZTM)#ki6686($Fa{5HofJKUJa4&I?i}^i`*=spx@h?9y{3uh zoJZ}!J~n-`-F;j-97tmka<8iik+UpapW&G&9-a$6dc#6d07mt}{3K&nzIv`u94g2wj*IUdhlsnX_>3~%P3(| zRvN((3$TOAem?8nxgFOKu^Db?N-ra-@%!sg`IXCuzr)a(MMcHVaCdM`vXya@+1~KP zTb;T35>?yKaU+f_9Fh6V!RUCh+9`%!FiWiZw5T$GyHAbss|%NR>keo0-s@Hv4`LDS z1_g693EI=`vV*yPCn$NJ?aA|3fGDXHO;>|^!fLy{PGQY9dz^U6S3Y!n ze0*a%V7#$EIsmo3;8nmX$~8vpRLVY+=Z%zi+qyPUubFSLG$N}QJGyw)@%78Zlv!@A z`t^uW`psa`#q8gBNIPlH&O`7Cf0>T^h(^B0R>qO@i|GBuhmsT)_T#UF>qxj!!xwZV z+4|Gfc#u8?Fr?i4=_dwYcJ1$;?oC}~%2=^?6~x;&jK*2n$LGdI^gH~I|}xdg4oPc?J{=*?HN zQ`^0asMo%_d~V30-y<^6_!wO8gp^JNMj!4CJlfUmZuT8tE{fjW;E|rr6zCi$vEEZL zh&bGyj{qZUQf4!E&nGt7Y5|3Eb^hDR^e6)TS*ht0dK0(!N#uk&nj${GD-u<+GE6L7 zKf{X8vItSF2iy3k90}{w#%CJ=vgGL+UJ&jt{&!F_b$dfXLh|i9?dx);SW$@%cFVu| zF&UXXvfcE3{{SOKDXH^S-y>s^@$yJc;W2h&4b=X7e%vl+btG(coe^4ED^EJgv1xpE zZ^(;5C;&sK(mP3=oeeN@f_H5z#$^jnzdK$KwjFb*yeFobM`WP=_vKQnR&Hm zn0LogPLD;Saqh}KnS0p(2S`$G4r!Wul`(vAHB=mSi&{MYp6+zwsw&=Qf09 zb!rF6i)h5pcCy`g_=tf{`Q^LA1NnCIy0Nh_7~#Z0@{`rVBs=BVXIi?m^IyKaD9~$F zEC4iJO~q?!A zgGQMLA1^&138G`hgA`OneM9g|-R}F;`nZn$$n=dj%i+g?*U*PZ1QL)s0v;EC|A&aE z?=VQ&K>dT%S4hJ7SIj~7!gDgx1qDyy75<&)3bFdSx<9a&RASL|`OZJoNJH!0a`)e8 zt@+0R<+Ja!=jlC}zhTm!)FYqC6AaU8ITI;rB#p5g`_?G9OcUlzk+%1Gb)WM2u6s*ISxW9s^ zoAAA}?-s3+M?;Mr{R&@;!1uQwlXPsUiff?4?GK2wd#ck@Vp3JtX(VKpWsRP!OeiCM z7uZANR+vEawRJJPV5oqnjij?ZxH@U<4fZn1AnB6btS2wEwQJ$N{p;R(H;22@2UT|J z7FQ96i$~Fnz*&!@tRGG&r>6Wu&v1LCIc|?~uG~jDZ=6D;SlBkGf~3yF7cFu~ude*) z%6p6sqyefJ7pld**UNEEN19KkZ09QdGkE5L+Ifd?wbv5<8E5;_z|jK!uDv_i+3_Xx z;av9FCUe;u6~^PkdwX`wUlA z#)HHWtd&!_#k{8eVvdnOnsdt5=`+Gy)5EgtK|D6o_p;n0@-nezPt^pi-4*hTb^qau zzx7zg?BbJdh+er9S{u+SX{~aR08~j%7xq<6z0y3}9B5+qA`Zh?XL(TEsFUvzU2xtUq7GeXbV^>xvV|+cW;)bOZE2>orChN^!>%#W)Z( zCv_u*w|T4*v_>Hitd{m_UW$3RN5{O4rh&K|G#qNVDxw!FBrLLRN2&T%URH&=K=s3( zJbCM8TAP0rp;~NOTJsL+T1ibcc2s_5UYy}Y3iKuz^onfm??1&So|ClE@6K={KS%)h$nC|Wo5QNCXO zx^Z8TNoO>V!m3I_Gy(O44=)f3DS8a-MumuCUJ+ID$0B-?Gjr>Dv%b4Uv9+J=^)dgl zJ(LTkN?XiIXhn5v|D`(3=7^vV`Cb@PfS?mKWs_fe?G$}Ti7Uu?G8J@ zX4gSO61^?0q_Ug&q|ZJz9brRu=|}P9A-#fFhP;zd)G=an*6=xH6hMc<%SWu|y`e<( zOKbf@Ln@nD6|6x*_9JNxKk5#bNHry$zugy}uy*lN-9Ys`Kn`EQt9pgijgEdR3|b8R zc%Iy#C2aLOtwEp!84xH(E^(WsfCRR8^6MkAa&n_)E_8!D11Be^IDjsQ2us{<2L4X^ zP6t)m{RG0^=uKmym`##Of!J`wYI|Suq~5Wss;a8u)?x)l=H_-3NRim`axs;cDJstg 
zh~e%+7|41o;;x2@5NX(b*1tgJ8f=cDG=guAuf=9lt(nFGdb ziUt~Zg;-AYHDnB}^aV?C2tmN}e*ky>&wvUyDLq(U_{Y@W@~skd>63#i(=O-Plb%gb zLLD266#Q`A;@_<0bIJsv^YaUo8t>g}&ih|hU2cZDuy2AWV}0-A^_8bG#T#c8UAX~V z%%mk$QU1Sfd;Rk%nxan+pF?;6$a1jd{E;nFe>T|ft{3-}7in*S-RumF#;iMU)8F5d zOTTgRW~Q3ooEri2Oi)IbH)$@&mX3X_@iTi~QIwQsfUynmO5?sQg_c_rfJfx4sAl|M zuF%)T$ICnO?(X|#k!NR^YWL}!WQO;vLE*GwWyAh$7q|AUt32g9F-bp})x`bQo;$lX z06PX(7CL->@&9{rJ$urs$I-jXcFqCbCy{71W3`%E=qC8WHPDEC6vvBMZr9Dc{zWan zQ{~dN=f`a?-;8YS?K0n-85%{nGBzaNeld6EpZkqFW?j5~JsUWf{&Lf7>-@Nwo0Gt& z20*7BfTQR;8ul-{Xxg`~RLgTBlPPe(Y8{Vj$11~kS5@AGc%W++HO!r!^(iA?9O$)k zm$vQQcvZ%x;=-2g+h_lLUzXrZhe_03gF3D?Lzi(5f zPDJscPIaSnU(*FAQI7md628!(9J~96LhrgvE!j^`w67bj-23WQKpLsSn|9-l=H~sv?_#3OgXjgCS+6+9lewFe9g|i*9pG>?D3Q5nn zDnAc+F5Y&l)~6@F+P<%yKhygAjg6+Du`~9ZpS64Ux-rh(p>8Ut-64*e-aQfP;GuVKO^ToSh?`o~{MhjF$S|b_Qz~oLl#P1NOm=VFUz)#uuKlA}>*+0Rj)EIHoq!#VD-jOB z`B>36o8LXVcJJOhttAirX4&kRW}BI%NjWG-|D|i>!W|Zy7_g} zMXom2O1XK0EA}?FVdQ&MBb@0PxFJmjD0& literal 0 HcmV?d00001 diff --git a/docs/source/assets/deployment/anything-llm-chat-without-doc.png b/docs/source/assets/deployment/anything-llm-chat-without-doc.png new file mode 100644 index 0000000000000000000000000000000000000000..952a43bcd677d23b8d78cdc23375c6a2c8621e8d GIT binary patch literal 138979 zcmZ^LbzGBc*gqf>1PP_21QqExA`Ag(L288bXl3Lm=@=;>A|O)2=&sR>EC->y;N`by`I4+|Abe{-Bs%%l{l zN(gVM-%51xE%fQBzITl^2KqShZEn8RdfpX)m;9LYO~K=k=W4l36|=g+%IU;%E9ji6 z^_3^i?)pWVQDYX}PdwBv4_y+Hpgk`$7}Rfl1<`)^+?q=dC!Bt;DZoa)(=c1t#^U$* zZgQ%`{$nwmM5yrp=Cuk@530iz&iKoQ)r&uzLE2(?dCuR&63_% zvBzYaigXaWjnA!=yR1!F-&h6duD@<2Wc4+^d2b&7vwGx9RHMu2?h{CyP)2qVc>1e| z3N6klJD2ToTtgYYp1$4GkT;Nn-^9w{>Abxy(Kh{)p8V^_&wZ8>zt^GYvULD83_hJ- zSxj_IT;5#Ucf<(%2}E1Y6a>Oz$6VuMVFy`Y;bN|^F@H3eKP)Vq6r6wF!G)$=`{(+a z(~l2@W^%l-u*9+Co=K>?VsA|08AVB-w(Xb+0PyPxlIWjEw+80Cz5nuv(@fX{rPlje zdvjNq7q#wjvA;Vqg_F>V{(cpG~-JRQCdI5h9-`9#mFv_W;1BGN}a=S_e zg2-lwmBN2r(2wdI#vR-L~DV&T{nE6j=o7VzOt;Qdt z3^vj=Ot-3^TTdvs-FNYS4fexVXH>|KwDdS;fh>nQL>dS>N0;L#X4pJ z5YAoZ)|p0KVMf2l=?G@ADub|ato0#FMf6p6MmVg8Qyx;*x&)66be+bI$>j$8A!9DN z6tlKsF6~_-M+C71DBJjMKYl09MzJ^m>JyR-u%!a%z2feKi%Erv7`o z8_cM7xiW9$3zP~-I*y-UWMi^1-Dz`Nv_ymoe+_AnicVla(uO}0KAFzcjOTv*wIbUH zG=Zqm4k}E%J zHR~LL$0j(eEJOcXMPCz6+vd=Cm~(i1$D(pU@%VM?P3p#uq;tE9#A*-P^=U7VV`hik z(9%gbr}Aix?hKCbYZa?AcbkI;BonAz~w#M3bd_ zP{me~`zT|4cw?e4l;Ju>vW^We&h;l}z>LH}Q@6~cU;FcFR5wy~S13pMH$z2u!Y@)$7F%I-@P*4bvsDT3v_+MFc+-T#obbNQ-n#g3X$ z-}GHmCofesb1J{b{eZb4p;0RW4*mkFeG!z_q#zJ0oxYJ^0_pkqjgr3$iD!5S=P4NUp4-=2f#XrIQ!=tTn z7H2#$<E`iR(oKacYe^N+q_<9foL;&+9QX|6qTmvXC={Jj8dkE9X6D*UEHrOYC{U&;d{6VR@GT;E(CVYZ|R72RBC`CbzQ@1B#M#`^RI-nrF8Jt#vhU zDFdwyR#>jx70x`wj`FJCwf66E^RkIgxyb9C8V~3u!r8SC619m^7Iv8~7Ir!t;l~B$ z0be8^ZJNAuQzsmKuUp*_Qg!=n$0*l7C_Aw;Aq)B;kf z>*E?kbj7JJMoo=ts~Wc}F;sUc)tJT6+n?QrZ!XoItV?{;ikBZ&(y!!DdO->LXuP*j zx_H=p4nUgp&o4;Ox?~)Jbc*}b1@zrs$_Il^EUUQ;v~S@K1pCh$){2$ecAz>z7?t~= zD{VQvtNyjDHFBj+5@H8Yb6OH`!kgEd08QWd6 zJT}NMELtnmcCZ>@cGL(T4L^2RJ!lHuLx=Jxa9oqTB_JucH&t#gkq}w~JlCrR3q4)H zp&RCf%1=|xz0ah?J zLZ-?_)nosJ10bH1oAHdY>2oU0A6TswqV>u4-QO#0ugozMTp~I@sA4S+_YNxe-=?E| zn*69d)bL=nyU0@1xq`NSxKQI|RYptvoC=4Q05m~DkeguS)0y{xT+nj#^NodMFJy~W zLHWf5efVtuJki`1uIF-&0_d}$-f=iJ_ubxLqxLyWSvIO4zWbaY~*-7|^4>14B zS4^bjbJ$dF1wHyMdq;sBRe@@~vc5`o6`yHs?THn07wet`t+ld-`j(b1)i?u#;avXy z8aeXPW;#K5ra<&|w&hf1BezoSo2oytfs|q@yfSU6%bMN6`7^}u`jZ34(qjR~jr)FFIUz69) z0&dkpq0-wdW7TCf-2kf&{db@owdJ0SG!u=OuYcg46hZugEnWvoP8YS}XEZF117&8) zXKg!pP{E{o&aeRkVD^6lk3B)d##57po zHr&qFgYP)*3HG0l@)4K8xz$+c^2rAz16G&(-4kHr(nv7!h|PR<=lfj!ro{1B1f1MA&ed-6g(1MI;XP^$fXH zESLWb@Q1dFr#u4OR}HD=grD2!*A@vPWaO$@!;;6D3t6myL6Z($=n)6$;G*P(suWGi 
zu>2>s=O=!m8D7NIVP7YG-Id(+*+>vM$*!Iq9rnVvd)1~m3iG_rE zbE?#x z?|#<9$=`cx54|*sVl5T?TrywfjayZ=P4-4mqC>Ad3#N1j^9g2;y-@1Hn?1*tJ8gg~ zdfBI<(+J?S1P8?h;j{)UOw9XB!||R5LjMAmSU(lKzI&uK8+!tK*O=6L2A>qJ`ubxS z{tHh$r;iL;)hM*mQ(MoAwIhMD?|$IiWYqNX@m^*j!$xppV@0hY#d2_?axjWMA$6@R zOjmd0&;yNZ)#5h{4^uW6V4#|r=}D{syXja+H<>nnKKV9b!tq}=pMHa!d?`-)fsL%X zpu;F%yb#NOSpEQhYe%PEAcz_p&Zo2?^B=AO#7I6!2Q0WfY(ATS{~xBwl43;E>7K3j zQtZ!3^u5KoS1GUlgyxm3`h$1}_CJk}G1F&cYv4l;zQbq+-%Gszctj2k822e0BrmDM0`y9RbF{jBY28{P_ggholbQ(6AjwEqdPgel#cVP4e|#Os_XFX`!s4SK z5Lust{C~)yT6`2>b>AkFHSpQPBjf+r(7qL%h#b=rV`8{W(DIxA@NvApDwAK&t59{% zFuV9#l97i8lMHS`(GxDkX=%ujJPnS{&R&;CroX!HzG563Y3q=v*on2_d_j>0fjnkHd9W#{;KJ2FqJ_SIq zLiOcjlj^H`x%I?IP3;y7K?dh1v0krBw?YIvw~b14G2HP94aON;s6c3*WBI-Z04L2~ zt&f#yK`tmbG)Lbpc3v0;W@ct;Zr=Ybzest6OJ2an$;o%|qFVp#aHLRQn4wB_+jMhM z@UZ2mjHZy5TPTWwn5@|1i7{9P%U2P1pK3M>es~yRHGH)b!~h)|$fR_*1^G0F(p)1yXHc$mT%WQ$s$h(?HwC#4*M(pqtIpDCWDSvg&G+Zau+RI0oVo zYYi}gqufA+7|s$wu?rJbu?Irpzif;6CjJUlhr*Og%wlJZPLbjHXCbFG6J2OYCMfwb znt}R5car5LO{|gHnGZ-p!q*8sJ`Aq3N$g_)HKfnH<8hF>^Wwr(ug%%DI#f6LM^74dKFUaa*hPK^Oct64!4797L|c(^JeXp7hJ@{`Lk$! z|8Wtza%Q^c$%W&DO)Xln8J@yAx;MUFG(jXVIUC;1WS)Ll{`&p9CcZZ5S6!oY`+`$t zeqiHR-io2>9NXM?I9fDR*&6m89UYbh@VwAdXl(jsz-cOVu~j_FGhbk)Z*JHcGsk$* zx@v7+wpAY6D@hdMEa&?`qS)98cpNDTI~_XvXx8;fbqiS9?-MAoy8Oln1mv^0_Op@E zs4*S~l=%TqZT3#eb9IDaUS3Hdw}vpb;%(r8I9NyZdRG(U+&V3-pLSm#%gs(&YdoHD z-0QX(%U_I69KP)KK2fM$`Aleiyget%dW+#sUqNSf^((X5IZ}a{*9xH$*ZA|WS}1j) z#=@Ib>0&`IK4y7t`@;6R1)`Us9aIEf3EtCgnhS;N4QHEW7K2%t8%nX<`p{xRzXLlj zL(iQ6P{NavuUOAJ0@te2UACr?XynA{ji_m_Gi$h@gK0+-YA%R=37uWbbDjq;Fu%_X zSU5dITkS3PR+3D@PQFc{Or*uAbZzS^h(;|rIP4^n1g2eELb1DK#bZYPL>mbXy%mXi z+p9?)TNQGV%>0&3rrC+kr6D{u?LbE|mPD7!6QI>dK@>JTzp$nQ&sipvCYskMy(?|z zd`ZB0vZT#Da>BN8%1ocjsbw)Q#<0rXJJ!HGmuLd!k{s<&)7kK%De#59(G2W}AM%xT z^fo!bzWf3?VM7SaglWP6tMR4gtm?%yQA-7>zXH$oJfDU^b5UAYn@+Kjmu8?0TkTh( zvD^zfAb||4@X}(eA(Ga%<{R8nyR*sn8fZ-5+Vz{o>6284xoX+tC_`_wu-EZi(1h0+ zdU-tzHh<#^pe7NPvQc(4Eozir(KASq=uIX-XSHHedy+>Zn(1Z_95W$X?@RT6NO&{D zs*L-Le`9&0tf#bIf94>-ZRl0_@aiPZ^^BY35G1D^HCx z_B%L=m~N)OVD8DyPME)@QJP`GOe&VZcIPfdBbPo0!6=`Tj%|HX10k3wFKg~KXiB-~ z*D`MgS)U6Htg?Si^|Wwrs`7y0B>mv^p58}U#B#CRhz?|uHDsZ%c5|1S?38!98v^#+ zVrCv8WrgTDZcR^_;XOT`^%s5cP)9fpIvDE#Q$^1-Hj;c`olR|ON|Eq~a#W(HZjiCV zRqdX)aPuk4>~us^zNRWj)g8|&NC0rdUsBX}bn%XF)) zpo?Lujq;PvWZb#L>a}2r55}nUy8-F$yISIeRvcsY5@NLTIx-!|ljaL7 zjsQ%&{ z_ue9mgIKRs)5HOO)?dZBUsO9M79aA=VZw8FHQS3jGE(Vj|7o?cI*p`LugKA8EH|ra z^1v6OM4H(pk^b1r++>)kq4z;F{Zi=@2o9umA9ZLQ9E1m_?ac`IPZHiBanz9($pI$V1T7;Um#bGnfE&MBpjwr;Jp_j*N(aB~n#8*=8o9{I#*N z?283VVL`=ILCZxTRwAP*)Q_9SG$Yk~T%d)%NK`TDso=d=5Lz={lXg-u7sX1OsSPuM zg^fsM(Va1_4w?AH!lN%#hWieIxTxX2-iHq#UXs_=-cw$y3=FmK(*aEIos>h#vD8SF zP%;*mIEj;H&t1S0{Yz1xk@GD&t+zj8uw2~zDVZ7jsHdr0PYLz)QbjTCgGWc*Y*yH` zPBMn=(Guxta%L0-)K6WDlc}WhpIf|~JvDJ%QMd@CwKi!v8g23JznX*`JUtjv6IDq+ zaX77VU_UOgo0FWu16?|t+qa#33wCk7xU-^}`-1d2nsqiEE5r{#@^stB$*v{u^0-ax zRQ!2$XAQOB>kM0|1HRr)NUZ)+qmkvuAyIChvh>cC8-6!yYs9~i3W$YkK6`;w^lXxm z@^ke3ppz!UudLV*62taahIqGqVR$ngu&8#pG^#%R#LbcG%a<<=;T=7yhK@@5S8oZq zx0`t?YU|-=&o9(~k%w0Bq+wKVLcdLqXKbE^irllm1n0`%5*Zmu$gY{abWxu>)Jg3X%WVLAiV)>xbp2M`>UoYQ??Dt8D z?n#gFfD1j9sgA395t|vTaitbf#t)7KX52f?+B}b@T)f)*vGM4P;I&);z7q0)%28-cMC@$RVMimsUh^>2PuaG5ug4({2%B=^U`?Mh ze16w0<6>FtYA1`z(6gkM_(ju7#Php55-boQi-SxNeNz=u?L23{>(kqzaB=M;{jhku z(>G*F+dAS>dRTHvn7CL>GH;Q|iUTC_eEd4&1Ou!aSn~B4KmJGO6;|3#%oasO z$8LS(Un?zesfk3|&P?+K7vm6gn5tgy`W`N}>5sn}2C6h{j-+=UZN_6Pe{+OO#oDjO z1=f-Ey^T<>h@ch7!jc}IqXhGwozmKDW9&w~pGlOehBG?MW|JE5iI04(|2acNCr{&K z`Rl zI{Dsjpj!{uZ|h03OVX*HmO(=ZMO8J^Bv^HfgWxvnnUs_PAN5viFqw5mr@E@RwBaa! 
zYj)w-T_Gih{tfg}Tz?%4nA)s{_I0{=dmH!Lv=eAlMQSRL;GH^GhflxCLjWGRCpB8r z>BWs&XA1m*zVu4HS!s-ud`}4Uv7joADJ5E5^{+=Cb%4I@p^HN%`k&Wv)V=p8$cg5kiLVSFP z_%=PuagSQ^I=hQKA)%T5fH6Ql>YM#YoLPKemFFfen9sJmjlp|EOKm`2qEip}K9Ipe zR@FKrO}&2V10%o>Hx(M{T?M^O0Xbl0%n)VuNo5QO6#WvrXv0kb;P*=tm-Os!7_)Ih z+hedH-sPH>sBBp-;mFEU%OjWos-t$Rf!14ArYN?W5u`$%@HTmddDLLA3`cQ?nze)< zl_HvJ&KhBm-;>7go0uX3h~+C%QKLGTyA@ks9!CpCmce_MC5|3%BjIXh(Yh7y21;=U zf>?E9@U%jHcF^4ljI=XupSk-n^;KzzH8Cwwx8O-e>WFS zSo!Cb9UuJ&PInC^zS&klk($CU>?Gr!g1N!@L;&eAX7>2+GO!!dcxU?0(sv^5S9<_7&@d>V3oh$uLi(t2_iz67LJjx?OSM*}$>e2<`lbzpb zK44d<$J(JpbJBQqa^f0guQqX(Oo$R^_{R(W;}6Zvea)FHRRwr+l(1<5c;^XBHU|t8 z-5TkToQgNdK#uxZUcGWX=MVb3!JVc3@08BA8XJ>cHFcN`&E2;6C89JpkK^Cm>~D|%l%oIb=#A*9@S|gNO*C}S*MI~_N4q#)ylPHzgil5q zqc8K&a32$w0xTS^2=tO2sIDZ}^toOGx>Mbx6D;-K3IFv&- z{qDzCN=II6!up@JRImwEF6H%GzVLRAB(b^^ z6KDuq^ixRoI<=5g8!rqB8*2fb=BLWfcwf~|*)J!CeiXU=giZXgO@X_lcGAtG`r+qZ zOcE-Vp}?Vt*z$WKZDX;g*mKr#04@9NS0cmK+4EF;JuzgLiV~^cC!2Ri#s+3B3uQbH)NiB$J0n#5S4ivBfA{-B7 zNDyfmM?diHEnKl7?{4MvBZ6t%=9|_ic{P$Mib6i7HPK4WlSS`K2gyFB%SlV!WflDc zLy6lJnfJi=P$LADLeZymQF+Z!m*52kd zb-BC~pHDdaYqjGVjB;CQQ(6_NH!_?qYBZ-%$0^`ahw>roGjwHgFK_49gBj-uYtiEP zP^9I=*A#k=KvgA&W|7|_eFE+&$(>mqv)znfrv}bYlbR}9f%P$@6Z$dKQ4>JB1}5*q z_j?eBOg)V&U?<0wn4i#4VbW10z4wVpTjH(FRLxisiC`<96rWhQpJ1e(X3uwS4p#hK zCadSmiZRlqtwjCun(AUtiVoMu^9~>310M<-G)Lw^`iO-;HT12=%T!ZT@l_Y$+>7rt zn{D#-vaq#QKxVjtmh;?`6gPdtRQuT4?efNDhK~riAFgW+yK`M!Jc$*_N)0^V8Z%FC zfX%x8rU64Z0BM^iQFn9m%L_!}kB{^63wNmyMs0O&dmGiu$(71^iok>rb}d7PP(vXY zN#&c$QvY)K7(bPcR@IS?K^x-uGyShBpg8VoSN(l9RUadTG`y`}qK@wpYhRXvFvr@8 z42rKUB)^lNqBd3dRI;F2Z~y7*1zCn-9V$P%jvB$ks_z?rRlcvh3!8e%vR1i-OC#c5 zUcV6|uEk9BCn4drZb}~j3U;zqZjz~#lS*oEilo^BI-1+eRM5C*W16QwR`qAYUHds8 z?NMh)0H^)ZtkXVt1^6FDdOF;fi>e{f(lm25`TRY;J^b?-k8i!L610@c%*2Jpf4f-V{Y7R!tn)`ytG*2C z=Cve`db%Ph!QU-%h z_O}qw`D{(#>f;bkfwq^KHj`Yx?2RQCX;{rI)l6=4we<&rf6q4sa^p>l<`Y++ipEnq z!lSjM{qa5ECIoRfp8MKxbv@F0F_c921Xadfxr6ibA!;w-&Y^{F zc(anOPWD+*9>4h9wnWzRv8Z0_lZOI23)yZJHvZRs5~6u9PA0^L7%ux>E=YI2dpZ6V zrr__0llmEF-R?WX8hJsmZLLud7oyY3MQ7dLIeeze>SxoCz4u`&=3NI%3v&Q5{Kr=* zIZ4+>dukfo!UM+3JK`nJyu*v@rW@^AGVXo@!};F)UoaMTAb%@!Uvnwuz?2xim6p1p3&3`HU=qvCY9P{$^E>F4E&;;0I&+O-3q?ZpZ|s{Jn1WPyB*AMO*FoT zs08@P{ZhMNgy$u~vN?XXY4;?=`b%Vy&dz$IABZiPFbHaO0uzw0ZcKIA$JD{!y?aM< z%2)nVVNVusBD^w@b|0D>$DDeikF0W!!R;sH76-HCF zU2wq+4Q8i3HEve&qL=J25v>QMCe_ifJ=k~+f|$-SRWrb1onJcOO9<7(=KweA%b-hjka&xtNNdr3JfP$5a~2 zHBdtLG81Zaj=y)7wB4n;ib4H6y?F7sGe~Yg`psZVK6|Af3b~LY-W*duc*waFLs9ZL=P7tU!hNylgse#R?UnUdS7%5KE2pi<0p>GD`@|j3RB8S z+T)P_EPzv(8hDB}Ssy_lii`Vp_x89jeYx<;j@QW+#29tEN9v?HF3a?CadA--47S9e zg5gqvwf?6;h3PnRJ@k{_H+BaOMJH4B$Axw%cE`GwZc^JJ`?slLk}YcthA? zx*3tIie`!)ue9Cr=Wph|`091AHms?sx#S&EG_$0YNXD4c`4|VEuh6G-mb_T?5RL;x zfGc*X?Fo=;sVUvNi=DKUY6)B{xRMA0-nHlL7w6vf@Y5|~8Y}xl16=Jy`PB07D8v7rxBh#LIN9x*^1xQnIR?MSV zbh<7$A94U_4@%SO@pQc^mA*M^O;GCB6^tXlj;nydnr?Z+`3f9{dp~(>-z;)`ss&vy zV+k}4@Kh3&tHyFAt-NQ%sZ;uf#|7j@B`AP%K&tXV1Dgh5yI-wIip0#|(gBe|oC(2y zw#3lA~T3y?7N>; z-vwI$RjebC6kY9YX!evz{_i$o1-!s9_$;mML-x{T`safRTx@_L6}R4s*bEJOX75Rw zbO`=ejK?&^%L*GwpH!#uE-s9ba6plnyCj0>#y@*dU3U`8td4guPN&VxI| zPT4QHiZJouA2T-Z$9d#(P*SYh{*CMANBbt3mK44>l6ym(&T;%0;hT%%5HxTb)Qw_V zNE^$Q{mc3ibn=~~tnjA@O>ElSHxm>(##g6B+vs3yb6q9{T1!?DYb#Jc|* zMLZ=HSJF5H%v$CXx>Vg~7i@G#UFsm%`IdN!1%R)s#0ph-xmVaLLUv04ePLcu7TX{? 
zHcnUTEt4I>ZD^67tyEbWC#G$3`7(r}Xtxo&-JvC8*SjRCV%+lq#|la~9eKYnG{-8P zt>f~H;e#ExyN$Au>(WV5dip2af?*P;+I>lTWbqj4msvZ^aB)4!ScS;b%>1q|GkM># zt^^zdZ%=~XR%}$_R75Rtshozv!@mNQcinA2?Mil1aUwN}>df+)z2eKmmde;JPK7zV zzQa1#$H(3UqK&Kri50;DquQP`Wz7mnKM7deDAcF4A$^mivdX1fI=?Mvr7|3B9Ed4V zcI%2`G|5ukuh)C8;box`&KHuWvA5F}roy2+{8u3Af?Rhalew3=qK6c^o}CXU)T5kjgs-0QtFwBd0K|jQd_S)=amlcFg}ju zSgc{A(SN-bCK&N8HHHu0DYlrZsR&}X&zH2gVT4IA%5P%Brumzz7MMqsp6{ergj~(5HbUyTC#l8XYwiBu2yhEg85i9dEz6isRUkY5R;cZ!8<%0Oo}2!Sc=3A`udRb|@-L6dzlbKsRh1-d?T#Z$G_)>J zX%0)MvWLV!;o+1M0`n(PF$ZCpgqZM8v6C%Ezy;veeSF#qr_?PkXaLzi{mHv`gPTI%@N%snZo+ATo&gBJTt769OTniX z)>$+2tq>@5;qmyWea1;QpAP#>2PhA#4x=l$D1lxzEznh&MLnCfg%=C=g=Jl6l-3}XBcyQ~@fni@zX z#AMl*BH`TC|9qk^Rm%E%sDN|i65q;dyibJM{SR;)40gb@z&U?3;6;Qiqd=I(n}D!< z(NC)Q!!6#%?MQ#b;jkwK)W%&rT`w$CQxbeQmr;i?*sr%#?8-5;gv-?4x%P+$=+7bC zmI;v#l>aX2Z!!7`5x$vP;7|R)-$e&})%O0Lw%W38i!Z*3p|}JsjQdya2^LN@7is7h zy{0VerD9dn?(Ml2hqJX+dK@kl@ni*<=pT1;kUHphx;M!rB?U3EhC2^_596$VG^m=x<0)YHZu(Fzp%2nbQy0G#$oo0-za@Jg%!ikn!xc? zayYx}6aINs=3_W>{AI+L=2m)UEoQ9$?_KeY%rt~%E;_bc4;3*Io1~m|h@UtLn&w9| z%{Kd6$Le1<8w$QbHjQcDd+?;!{7kZCRP*H(@C{|Y%l!Fc?wy0bw?zxBS1%5|$g*m=qcg>p@N_4R$By_oP%{yryP zoR6l&JmYC?Br&SuG2JhM_+QO4S~4vz(Jn~99L&1&_YA;=KW1R=kGG|FBPCUdBD`Y4 zFEP$!8rR9~Cl+8zAx^|@PG@oOt>Rn`#FfVMpyzc#opo58nV67F;`Qxzh z&$-GNCflbDYAGSbhC_`8uK!p>Qw>K-u>WAUjv7Ci*;GB)J?vkr_|S=S4^^$rX$pbd zS}M*(ht{kv>28dES1Ldo5B=n@{y`9y$j5;|S`mE*BK@U+@GsDT{R@ zs~wh^$69{mkN@2MKUa3?seI_aFo)sJRYwbYDV>jRJ=bBnEw}R3pF{*Yel)&r%LMH` zq^&60(}Q(iJ>>gwlJ5-deKJ9DV`8h?C1XZ(t)zy>>}NT}fs5UA5Df@{4-0#nkPBR(h``(J5o!Kgxg zg!0z(4gGf0WWT`27LWB49nYK!9i0ZyB&UBvZ*O0tYFh+lX7Ib8&5!SmPeV}PWHBo* zlt!|W*!t3|H2`U;A=mN1tmpje<#wBb^}4l7v4k*DN#iA2Ubn%sx=3vPa|n7?inwfE zBmdlks6=S((|2@hq%z{BET?`=7pi(?Ug37nBj{1;kB zCq2Y$Q?0Hx$}FPMt6wlZZR_kr`xx6-vHC&P_Y6_v7-}TPdi#kOYHGGOzrpJE>a38z zyln%2Lun}NNVsqF!f?I#<}sP%~DMCv1Y#v)td@NBPFVc28<-C;N8njX3_wwxpY89zVSRmdM!%j)cJ zOET*4IvK`3n>Ys$Zlne=lwfFPV=HZUOS*wmLl{Pupa(`_t&{7IO2eoA%4fuSgx=$U z=3nZLAMeBQ_h5&}fXm3S+;dyO9i}BffNReJLd!W{; zzyQ6FqwJHTmA1CgnMyj@^&U1jqqciThm2lCP<*M&tCAJMGBbOJ+>o8rj%ftOQ zhmKu5XNNnei*YO4(G^t^-9|6n(F8${%dZ{I^fp{wdYsL)pJC!i>rsAr z`B0DGk*At?mRWBSvaHr?N)1qT*JJ-4EuT4FvRsBw_+hf^XM5l`0HW}uP0V^O?aN_X z2#;db^*?MClTuL?Ji-Cy%29acqTi`hAd1SCtDC+?Yn^v9*9r9FQ6AEXD#FP++9QsB z6e7?OK@8j2*U{&lhF3NuV!R#G&Jyt#hf{4fb*q#ftC?D!O-oo zVS5vY6E0gFF!z!v;EN(oqfEU-N{`ts4V=&cEBmoV;aBqG7!Ge=rUuB9XH)wA>$cNv=na=-QNH9M)MM0Dfur4Z{7(BEWWU&0B9h z$V=p6;m-PWJ-oL^aDRK|6Z0$4z4wD4sc)G0+UJSOH)PTEsYW?#+LNj3_P-7oLm9@e zHUedEN(O@R;1TR<%yEzK4U%-9NtxzPzZoI(Yz3}B`HZoJ%n zS+Lt{xm1f8l%z*ptg6C-PjK>XljPv!Xj4do`;L7B$-85L&F}}|Pr5L)SP9zFiMz-v zue0%QEQv;NYotZ-gjn+H-Qei0Xz6sU@YC#6BlJXNr9X*KUy|FQpvTI+*nkDz_v9M{ z9T{>IagvQr)WR6z0JQPK`EZHmR5>`Tpn@dhXak4xcK4J|%i`{Y9aa*X$!852C3I`UyS%uF?OnYPdGr5m_JT2Gwbd*MzWMNB^q z$1uEJnsl3`L+?=Nx81{I(00=w&q`Hgx7JHx*XjR|n}Yk*ouO`XC|9i*j%mP|U&o1$ z4ZQIN-WYkEJ0`qitF545+wluipeDeyFyzv+$29nVcf;VOj*o(dd>TNUn; zKq~1@*ZncfUgL(EK zU7WF6iI(pp>E27*#Y8L(rKo~$fo<>IKHH*-a~&a{Yi-BAQ0dQJ;}WPz34O;bGxFv$ zfuhLXKH1?~C(kMRNNoFq_XtBgocT^x<`F@kc>q_p^dN?WPx}~|MsihqWaDw%e714E z*K;6ocUWW0UPXH5tLTpUm+ME_ej@Sjb;vk4{AG0{z(ngG#SHXT>yHdt&;^f`i(=4f zWB5G(Na3FZCnwuyBwz$A`sj$UB`fLE*i*fhNE=6~bT6mABC>dqX;4%C4fk-7OQzTW zhmVH*78#x5s@wBZl{4EC*NJEcp4HLRq<7s0(5ds~mMjHOKe6-B^}aB;fxd3br&GkM zHIx*nd9czywZ+cKSuS#TXNhHQI8Z`Z9oN#murldlsr`XxGO^e$s~D348NHSE#~#GV zD@%5o&lx~3kFOSolY2bJ!jf zO*Fulis2gx-)_x=a zuWh*~j)mE^$c)JN#kbzxO7E-ntMm1cE4yH)&BFvn?!~j3XW<&*Z@1p6KP=ZWfOX@Al%K{d zT?($GyrU?O`sNr7$FFgPlw7ROpiYcJmv9IAI1ygbzn=xhl*-BrTv*9VRkC{$DeC0F ze0| zEU=5je@^E2%-@-2o`uzGt~&eT_HrpcZ*X3c!_LQtG{?(+;=yBrI|m&J-_yQ?Af6v> 
zUbc`n^0*@8-cD3iyYuC~c3@Aeb^NyS+Vadryha8Ot^G)BpgdP?Py_Po;i6)t8~Y4@ zH66M9Xew`3HuxuwHO4wwpo^<^^8pCkcKQTav^- z&SUw;u&Qj19B?~1)Z8w%2zE)f`Y9LtpM1u5lH5)~>PwnT6SKe;as7WiVpc{j!a*Rr>L3Pt+&Cci^*Z~-aaN)$Xm z2RtDf23;^4IzZ?g_WOa3D=HJN-$S{?8k?0+DzOr^_yv(T;2Pb02@5}C7%}zr3Z-co zpRz3RE7GA>Fe?SzFGJ~a*TO$K~oR%JKa&aEObBhO#j{3 zS-?w-Rz<}=ZwgGt`(<1H-c3d-+)2k4N{8^$AIERGCrMBL0TDkb2CFt%=oe1%m$Kwy zL_fX#zw5^wWUAt$r7FAE!tAn%FY>!yW3nFZ*5p^LMETDa`#`B2e+<%B3V3NVZoE`%F(!6RPvj3lEX0eqFOF2E?zD1#c*0QT<a-Gt+;3rCwjrYb57S{!xe@fE8JJs{y#A{# zU{>L)M}}WbY$3Om;~^0UDx9J}_5XJd{uzVoMH6BL9Vo?SF4dGJeTik)|Hnco$ZKPFFANNce}Sz-i(``i`u0$aAQnXAm)cfpW6w2*OH>uojZ@1848!^vkwVa_X5s zHtj0LH2?TvK`XzV`4)eT9c!$%$qOx|DON4UobhtLuM(M%fWYUA^YP0T!DJY|L%~~( z%6(biHgQ>gU8p5x-&5d{-w#99j^j5O9=h3UfE)kfKZ|kIY^43A(}h+#M5E?+$p8Pf zI6X#koCDwcRk0KqfNQTzyVzFKM^*U-n#Unlmr z9+f`mBc%q8tx_X?GtD0_xzxllzZ#?xvs`Qu7p2Glhl(&_exK$OyqS63Vbmnss{B{V z_TSGdSHuQbP4WS|j0bsb#k77g*}pfknw7NE);iS>hzotav((4(hq;-@;mm1TbE%KU z_<%a=F8*T{3u)pxDHU-Tr`G$Wr=JCP26TEC8fg!L?l$YyJP5XgOKi_q+{rYA-;Fkb=O@#zu$id&b;rv_uO;O zJ?D9zGneOBW4FjURO4ig9Iq+?{IX|&9+qLPI|duetC#{TtLK`vujEy_rC$8Utfc9` z)55W;((78Jdwp3d#`}UFo@|ibi;DS}@K zfpjJ6&>zC1nDJ~21(Za7&G(>Zq*H{NsH;To;W+0lnCH<>V%^N5uA4rZQ{tJgCk=Q1 zHd3^w!pg{490WtOs^n`JmKcFzW7(QNl?NQukcVpANrk8}dzf{!-p{89y)8Gq+fUn? zgaxXud5zE9#DajA)e(+OqICL?V@|q-$*JtusYVhipG1o}gc0Ua=srq+=?Y$H!|7gO zxNuU(`7YhLM{DTqQIiCiMx{{bAOTezmd+LLxMKo_n}oVVVfu%PwV1!w9l;J>K)ZmN z+wwzTIBo1EAD}idRD7xfS{RJ%8jK5J0S&ZJ%P>>L#1E^lEEPlPF-Jn>6{+-tMgM&) zs#4$Juqzqi>?DU(bv*9AmenA1O(|PS3cKCpjpF@pl9m`_KKqi2{WgU*)vIVuEWyX9E9NMQ@q8DwBF0;!6;ju~Z~;(cO?oeTZ^P^NvWiir8`I z4C&a2Virg0P?n2OEuwe5e>4mC(2bVsO~&X6kQwe_>HlZ$W72b zKNKa}epmXV{?5|fkSI#*9t_c3G1Ie~?y$NEFzGYsC*mHc55<{y;XwYjhTBw_zqVi55 z0Dnel{n_>mKzHnD@&SG=|FCBFan1J-AJzxlBS-*9khOiXEe&#dW%0Hv*&bXBgY-;H zCE&U}##*(jFcsm$^ZGKbq{n^D>rd~7S_s5&`rlg}+G;~&VIr6%vpW7A=l~O7^Kled$uCF9OIY2S>}sa*ey6#iYt`+>nC>Lg<<#<+dUOA08rvrN7z_R+w;k81VbIRr|)_+ zAGsFwB6#c8{U&&N=6D<|I>O|hg1T}v7bB<`g^%5NiDKg3hIlhVgl*E>7c?4h5%TxN znnGqHuQHf~ly=b+>hBmcsz&V? 
zPej6tiJt-RbR=uQ&h!%hf9y5s2$Na2{FCHFT+MB6b?fH_ch^Z>3!*L1zY(qQpj$(GGMPf`U|cL?PL@or&q3Sp2j~ieIAd> z-_Xl{>!R)5<f`vZfm~@c}1{UBTuE9;W0jTLePejLHTo52~-Vbzi%sGFM?3{i8ef=HV+-dfIzL;*}2Lt zd>4q7YNP(5yB>}#zJvO78VBkm@uwo5q-DQkZAf9P6qBOix`im z;l5gh#k~HV?{0DAX$}_{jLwa(Wr$!ZVba%#B`RV=9EeQ@bHW`QJi8B8s}dYGCR=y` zqRZD83czJX06SE6w6i!mUL`p8n50p+&c#7nw&mq79s~r@HagzE+t?*!aP@MFXlb`| zcVvGizIW^mRO)Bn#S||HS9_j3{Lg2VUB~o&Jd~@-hUPG9>lu}GMso~94HppT`UwEC z-FPXe>umMV0zluiFJuD3xqV4~`N{@_Cx5jCsCQ^2);G@HS_8rgGq_)yySTl zziY5`JntZ{{gR$LYQJc@l?4+8*%yi6Z#R5|D91oEla)unYzDO1dO^^wE)= zw;AN<$hmwrm@lT-Ji%?7f$B}R3Pa~*=%(P|cM%jwMO%znR)C7j*f(0~7=!uwXKwL+ z!4v;XAd4Ecg2r=|YaQ!pEi5A+rO?I8m>?m*vtNa`;5qH|eyebrikAcWijVx*tp|b; ziZy15|B6G~pm*jCZ!qXeH`4OpYl02r9l?;K*Iwx9{-$oHYYgV@;z*aJ;%cV4Mp$YZ zEvWlTfGGro(eQ8K`NnpKX2CW0bKpIlDRGH|zJ+~K3YWYco+Q=Qj!WOvH0w37r&xY4 zVPSC*_ciZQ((Bdyib?iPk^XCOo*_8VWr`rxV+Y7}1TPV#O`n>ne=keAzM%Ur` zHwO@89ZVQ~SewogbOYP_Og1U$HG*e{AJa&v0zc=i+%^4N8KjK7vN4-;xU*c#dtR+| zyoX#Y8MepWDjc3SPuXA*(c9_tG^OBk{D$~kZZ_BsLztxIm5Gb|Sjkc)Z#CwXv(t%Z z5E#oItu1ZH%>g85}{i*VE^tcHMQJlw$vr8D|d(6(Rm>{ zbFCSNEz1#6?~XzM3;lkHZO}1e#O8EwcoZlBoK7ANGvv@&OakqLlcOAuwn5ju7l2VA zn^nkd9Pg09e=uG$SYFscW>vR$gN$d}GsqE3Rofw*|77!ZkYl|Tf3L&v=~r2Z&eR!$ zoE@-7grY3Jsndi*=K0aM=lSW%81vU3ZmH)v^rRs=$goax75~SD7z2Whtwe#Hp7a*N z9AgZKcVAb(CVIfBJnixMYrej@ewV<9&5Q|i=+XX2R@&a63ScRJU8-&KAwUfG(2Tmt zOxuW-%f2v7X-myWA%^7KFuVsn&8xQ7b}zh_YjBX7Twpwur@r6GJ!Ld-^1X?#2CMk1 zfl@^>Q@rbXRmOLfg784r>K@pHy$`nU)7$ck)ksf3uvJ zW<^OBp_CJH=PSB`imdeV;Ly>iMu6j6Wlb`rH;SKqMZUq^!BoBZ%&uFod(!!UB9`4* za^4Y2)rC4eg4y&?=W6JVb&N|I0vPC1OxRK_Z^LmBSjF~o+bv%Rjj*F!`7sHG1rd42 zp28H;x^>oktQ>;VCRyz0hfB5)k<2EkQl~QZp-emWvi(UH--bz>&#N*^x{2CKv`sunSkLyokCUX zWjc#SX)6^Kh6T`U;V@%UPLJu@0qEXNnB&1ln4(PnBe&fp`Ln=XuHlmwYemOTLEMvi z=Z9aX&Fa22r=(Y11tDhWAoKs#kstu3d1E&7FsgcE?G zUi$#Zea<@zs8f)%50j2&|7<94x#(k?Z`tGnbzDx599+@SwQ0g66L7;n*zkpD7Co;5 z^zC8Q{VkG)L*G-R?r^nh|DdF#qylk6u6%S z;~ul%h=khK7rJ)wG?ajn+wdv#VvTtek|OMMuiNu1(Xwt27CxME+ij&m5=a$jiM35Q zcCb_+z%jF4v5b!k<=y!<<+AZPFmePeumNhlrqE$!piL#8u^g?IQc$&U&iLB1n&kX+ zgMS~?fK|#LIIiBTIa-KiiZ6TdE$S=NqT~l;mVD}JuekV;r}*hSo2TRHI07jCSON*5 z^2|h!p^ZX|p1K|`JhJkdGV|d{Tu-fOKQ?}fhOq`2!_(o0v(MeqcE`)c_sCQAEkO3i zMsPNh&H^Zx0Pq9|2nfsn#=G_597=~DAzC;_3-ZGA#M^d^ESf7yS zy6daRoz)%C9=?kmWbh&2^;FEek^T(2_uwo!NyzP%fLo>M&Gwnkfop(*x#VZX-$Wu^i`<L*`D{_Xi13vs;ZzS&j4%3S^}_yZNM0ceB>4KHsS4Ya+8>?i+>PVT3RW$Hyda zb@vFMR%hw50MOuy7EC4jbj{jxujwvDrk=dKyk-qB6!-mz6-v*Om-P}*X_(W+&mgt& z2^UFb>3+Psb&-5cX;T`X_>^p}@e+P%8>{9jS^!1(I@ASGYB~#fU_BD4>#>Zi17@%f z%Djie$0ALcGc(GL1M=d^R{eo@gC90qtHKw6C)q|tGON#AqVLa?t+*}XSoSIvlq}R2 zL3s?(Q$_qefUR2qm50B(Oro{Yv6$$&PPbaWHxzqX7YDP9;Rkzref+bPT3CV@aeN2HBy-LK=#c@{qS=#0&WGaCL)d>XJj{?NdtKZK(?liaI*m_V+A;*S-;;X_ zgmf}_Fg6u8{-5MgAv13|ph^b77ZMy6yZ9GQXZLzU@Ey)?q8>U}li7sO){Hlv-Zdyj z;p$m-Tqd*l-tZ1B)=|w1p8ZP9Hz-r@AV1S5fF{VZxgk{UzUtk z>OZr(nva8^=M&q>9AB=do^trQ<`MkXvYgC>u4byX&{ia7z5W!2(%UvTpBJ3-;Lux$ zxM%-0E548DVr;rpT2jspF0co%JD(&gu4J(AvfyVn2+m#UIi;t#+^4irw1jV{M%gt zNF0Kx%w$l_`lAKSNZjo(>+`jWDMxjdbFNONROUpPcwR)F`qXGix4BjEdmn zu)@_qa@gc6!h;g~3jQMl-}fsM>jM7Oe*Tm$BL^bh%+OiZhLi3yd(vWizycaga9vk- zK90$oa@%34X*3*+e>H=}s&+)@`3@qGs~UOF;37kk9bmT>gG=xbw7($FB9`Q@2wINM zJGA4tlgVVT6qJUvc{dm`XH_WcBQ@tNcAmTrR`#qcp5&G10Z_ z+;111Z`Ag?+P2%1wnxNCiuABM`YwWhH1&lx|C1Q{kiCdenpWudA&rs7V?8G798z%M zo!A3Ydc}uYMTn1nG2`F;TC*nWoF??o?R3eB|4{J~Pk+w(E^Dfg$KnHVykM*p4jrF( zu_ftbN}lgw3fnG7?9@)BT_rRWSbDipcc7%wA8D(tI#utkaJi;bSHvj6?mYiZVyyAX zK=xVSDR4}SV>o_Kpxxng>N}6snMiF3boVD+t;Zb4-sl zYMgT8km)n^s6TYjKrKcj6EfwFUO!W+SCB_QvQV<0d47Wjn_!%H5CfB_sK|RffkVAW zbpY=4h?!$ISTFdtA}7`Zk~GU_?puGfoq}a70(x7h*_rcKTc2Ngj39t=LP#P83S`ur 
zE{5>U@hL0^*{Hg)4pamFG++j*$;Gvf^FVbnseHg7BBRExE z0IrPIh4sf8ZXa3I?4r9jZ%z9NEBuuhNb-R6nxvWfKaqB9@Wnm5z4I2uG6b}{G#i-A z(o)aCxqhq{1YvHlvx6DbEO9$jd@yf`_jYAZ2idTFJ}f)y zxGd8|2K9}WkK0O7?aEP02w#N~B;(H z@dDK@Nb9L;U7*>*sa-L@sVEobtr%;&R(8JGLP^s_F-2&axEIx zaXoI7zW>doJq&+bi{1G68)@f8ioW7kay$Ez>9ex+jqBp%%dhdVv9&#p<{iN|spjhx zUd`M5njh?&1ui54!8dNHoU$n zCFPD&!`Sg~N2IEOD0DHKG)vkzU+AL%p+C9p8=Qv}3m*07iFK$8HaU{0`)9P#kJ)^D zm6`QKJ_WyhuI{j=4KJ*7n${%7*=POCy6=a1XTR7{8_J|6CqzUnA9kltU!#6Uumv~z z8T-duMt%m{aT5>PI_nk_P&E>*?si?S&Fd$sS0T%&j#%q@X`SW^(V9PMGiD6e9GYU- z58@wab*v#RN(9_d$GT*D={pkunWSw}xbGeNy~84YEensSFX|AAbK`LG{@#NysoAOU zrJ_2iW(jo4W<^nmq+eL?&ZtL3HBR5BTZyF!KKK2pT;ouv3eWlt9^+%(No7@MS@L0G zV|95p^1=6vda0H(Y%8zVRQB^BOp?X9w_!kbPJLCDpJCwjou_%RdGQDpXDJ_?BRz_} zYI%GL+>i3nybd{13N_Y!>eW7$In~lmf`zr)IfBL@CXeojRa`aNJsQ&SJjJE@Brl7P zt1qv;C0Zv>6eb@^S5Lo`EYt*IuJ5k}S6|n&N|#Pch#M`6t;N)F8SM7 zZddTg=eot^5)m1H)RV5*eYs^^mDDj^z^(W`-o7uuBOJs#-B(08$(KPmv+W(*g1)YO@wgAh_1ZjkU3~+@u5l?xGS3O zw4p-`8w08sC$3N0a+SkuH2C#>`if>X7!J}8;(gME<#q1yWY=vO0%?{V;fM2OSbn*t zPqo~5HXAQJ$ImSpdW)^b*qXgYl-4iTMD6S-*9(LdH+C*SL#qC&#dssIo=n)Rf0#*U z64XmwaB>W|@S5!*DP?+N=hcJHdckD)35TjREi6-#{I8S_*jQD+$tq@ycb@oby$&AmxIu?rE{^uj-;ujtelrz)|CH&m^%id8tk z?;?nlN~6C`eQi~dpJyQW88fL_@p4A&u^W}-cgzzMGt8T)FT%&Fe{<7TgmK;R{Sl4(U(oV>+3OOb}+BI_U{?G06bdsgIQO-J1=m=ab_eJVnW3;WzbD8!%Q52w{5mrgK1uc`&*q;y!S2d>K)EmfJ zJh_&utg1aoBTspqIe0?gcrpldc&R5?;lQxYjlJ5(nTAhmT}DGC_!{N=3X(O^`V8~EHKo= zp=Ko~_oVXF`b)@x?yJETG2P0&`2+Dtz{(5HOaK+X2MAmowHE2A9QB6@ZZlYXMFU zfP%*;7k|sq{fGkwEjwFNv?4oo&AqJ{ zV7N8&+3K`pf@q=Q*feLaR_9WCF5%V0nETUwR9SNH$KBcD;`i?{WU*~xQbrCFp3@(> z#|KWgh##s)BzU+)>6+G}oE~QBmfUA3%J*23`=Ct>m07PwvU=*)d2qh|kin%YFZaCB z>*87cS?Yd{38_t|LWV(OW}NL?-3)_f;f?no<8X8 z&Y;S!Q?P-TGjONI);L!DOFu8ZTAn(WBx$V~Ufl03mvkdPue(=Yk5*ONyT|*Dsvq3j zSB$<@x6_0Pbj}~%TC&4h<_ii|^o}F5u4O1UO%ym^BoLfh@qwXGae}=2y{UrR-c)sh zxg~gvA3Aszt;PJNHMoZmarSSNm*P8`B64$b@YaC+iU15Kej}aYg6VweaLtk zla<1TAFbsx_gcAZ%ar7*0$Y^S#Kb4<}9}2G5d{mD)YlWhhZJ_!Ya0wCX%Q^ZHnWR`edB- zM<+cZ8TZ~(%(}E+by{2QP4$-rTBwF;S!nNPdSwBdXZ*Sx$xpVM*v&3o>D;&%0Kb|!};Z>{-O&9-u{t` z7oE?RllPn7ObamNu@VhWRR(I?@9ovc9PV*1^$k|rBaJl9=ZzCzB5O>sC*XLNsVTP^ zKvF9h`rxGb>Pd*t4(UP~&SA`ry2#ynse?Mz{^QqsNOM6_NIJ&R3hsxS&w3h~j;}xN zrny|}{R;I%(qGEjXU36Nya~NHuL3CQ}QD?y8JAiju zldww8xESO|Lt)?3BoOu9LJ(O{6U^r~^_<^}V7M10S1z1Hzjbu@m6x_mp2yxNg(}TH zIEPoJZgEK6*pehGR%7{5PQpsw;1{|?x8_AgY@;*m{ce`OidIn$4pZjJ0;p2A%bkd* zC6v}OqxSd#;o|k<1{h3e>aM7wD2H^`lGN}+Zdw8QV}b!t z-~X9W>ccxi?Wy2Y&WVoV@N2+aNHi{dm{oK8@>g(~w=y~^bsPbi9r|?r`qX-ix(m!AdK@>?S((om+8 z%MFnTI*s^8`n9M>S8FV34=1_i^%J)lxj21znKYV@XnXgpKIAAB#tW7?$AksQ5c>Z#c|5k|4JP@$dl zl>!|uVV@ZbK0o`h+(&#Yn&tQ8KC<3K{@^geeITC;{H5v1+uo_CsBEhBA>81xM-tlJ z;(#q8)x|wxRX2abwS`9g-Ntgt*49U@B+&rBL{i>?D`sIRF~y)ML4&95r_A=wSW3aX z?KI=|Ro0WvC`&;WCwLRkx1QU`Aqh6}axwaZDcJFN(`{xAkcQkU%sc6Ao1i3zk9Y%+ zn}42%Oy0}=nW%fK0d2j+Z{!Q3Pux!Dzn((*%_FXumlg=f5`O}b^B%0@5)6^}p}`zI zI(ea^HBwi&*rMoEB7y67!`Y*h&L6o6gFvCDQ@P;>4>U<@Q`X$ZI*ob`&d<7?Y6Mq% zMAn{ilZDqCn2H4+79R+p4t3bkGU50G^zubR0UIv~8|v3}myO={kwkme=z(pPW{Fgz;vSN;diUcI6-}#Knvs0y5|lR1 zQpo~KWWPB&DI-_rbAR4bpX!~wkeKI+v*MZRWC^WDDz#1O;_E#{uw14M2*ICw=g$q) z4sB*tBYa!AFjS&&8CmWvm9KJ!fpiL6UePz`t%O~V*&Z^S&M?@wvv zvKGzS1eh$7)VRJ1KH$XrI)JJl`4yswds|Mmxp&_V9_* z8MJErRe@(NR7j#%fr*8+z0*q?FWwlB7^m7|O5Jp-Uh zuqS?|pya7FG*%(7&pywch9CucB?CTSX|Vby*i%^kb#Pi_IKSetEc%-PGVfkKNL8wJ3bTgZ8y zY#D2yjA(v1l0VnkRwF5IpgRpHufil3a&JX}@yXSBO|lVT^byO28ZPx@=85>@mG5-% zX}*{8g1H}4!Cd#5WBHsCSk#L@0RaRSf&jITw9gww!6k=E(bV-Be3E%&7teV4XG>ivu2{gmZez?}NS zsn6UBg&(xLm+mNO@BA0$h~8Z)I8UUahxl@3se^vw9D9q=$Sr)pi>%D9YYAG(dU*4f z>Hh1YKphBe*Meu{fgVdY5+BF>4nGc({tmyi58oTs%-ouTn_AGlnf(tM1Qh$n2}cZC 
zWXgAf#~$2cJdoiR{~f;{%_z6(C(?PiBkM9^u!5DM{mX3qFyv5I2qCRZX<`NeyseI- z`}#jt_LrdxeZfB22*gq$vT6#=ddDxq`Fl?lTS@Aoi(+GAjTo$|r|AFwGIo&nCvG^tj{#zK{hM9m@Bg6+Vwiq|g{7-Y zJtc{6X6Id(=ba!Dd}KS`)}{yCFG97tbYG1Gf`WC zm-M1ce4b&h3X?@@l5{dMdS^C_fk}&m`0pqE^Kl0nIfc@JpnKTHvUp^BdD($_(igI- zmHtaJK<}1TB5%2S-OWeP(d%&eu9z6S*OFH>9CEDw2NEh23z7_vrD6O)bCHbZWh3^H+De1uJG|X5Pwf17Am)F9b5!0C0|z zuUgn*`7P>!x@#OZ?;2dh#XA%Daf7`gqcw??>7NrIboT~k)eD@eKU#W=i&vlW=4IFP zvw-w)oG#admvptmIglSe1tpGh_hzJMKWk1I8+)BGZtZbIq&;xP|rGZvl{L!sBr3tKT>m>hT5d@O$@d=iY>u zE>=%jPSulSD4X`g(#O75G^L}0u(V2p*4jS?m@S9iUbpK9DAaCl$0sxez8!X3|LQAMG#~p zfgkI)m|gqTMCm1K2~%NhUjhYJdqxc<_h2}j%t>XT`a!;lyli+^@#Pa!)slQJ@cD(C=|N2t>TU1od&!VRbi#m{uJx#C_E0nbSLy7e zdMI`*ti&gyvjp^(V!TRo6VG-C1BoHwe+8HPCi8X$;5t^O#|~#S`@O_Bs=Dy4Y*^b^ zb$>(G)IZqOww(`m zZi1>Mkn;}Ym}h>t*>Jp)TW-IsaDY6#pb0=eTI{_#jv)7mnw9%*wku9G3RKsPt!uaZ zq@y18y3{O!g2&4Do=On!h}{c~l2dC*472A}?%k0TBEK50Kz&bfrw)VP*Pa>sJ`Iy_ z662^H^DifI5-Sw+Z<@>4>*`qdlGgS~(;Nm8C2KcHU#>#1pYj_0R8KP#jI@fc> z`wAueHmgY=z_S2#fc{mFgGw~H9z+LYSE$_dTvvEIl^f6eM#TTY|U|L?kzPo1q~9PutG3n1J-)qFkm5c3X&M9Lx5S_vAi7mJREa z0H^fXI+a|2U;Bb2NdE4Zfa0clV!itsT7|!(pl)9S)d$jas2@@+*j*;&G!sTc(Z{g~>j>UR62RFsqqGHbqfR#?3X8*12W z#*q&21%)Ul92d+9^%k#A3+{UoO)*K&9P7+EC!E}Fm08fM@~)jcUsZDjHSaZ25-)^X zNv8#FMT~nlX2rG#gQoI4v*S|g zHbA2sQo?{>fDhG|-9tjz5zu+Yimv)AnFZWC+XJ^DM?&J6RUYhKgkHQoB72CAZUDt%=}3hlAi-@6+C zo<_~}3CtWuz0o*MfUkECDf*N`8x@P>2m(X2;9(eNhToXx#gAyo?iUe?eRLZ~) zltd<}{nAL%`|>q<79F$c+LP5N8pbGv8UZbpv~>}DzHgmDIsy|;JuqCXzQVSH3D-i| z0oN>?akt%R|NTFZqJ%^=JA?47K^5&m+3N-K_yW73WhTAN`0>?DRw~(gvDNkY1@Suz zRS)b~i{Q4PGsx~PT;D@YsVCT2i!}~iPEt?ldbIN_QOkKmQFe5H?Bne1`!hGZzuc2c z5PB85!H_!1j@@i>Ieo=QMqx5C=L4Z}?EBz~M~TuQ7UsNb|1eKl3A~Fx+zE^@Y73=@ za)$}h#HWlVnJFkaS#|)L{8~n=b-72R7D4Xg5>8Wz zOT!D=j-xcN&lQv;BqU>0RtU@aR0(CyCk*OXete$=NvNK^Z76gA{>rFD;qlz{7Jn4u zV8_Ch$kxgU#@xfNI3%p4lzgjt)|Ck`;Vx2jfs@S^k0>>Xw~og_M}UuKkS^rLc56bF zT1=QMTuH)}c*&GmwCOPElO{$7@MM1nk4dZ;1J2DUhBwUm%Pl{%-g#NZ)SLI#2a~CU z-qbpjFTG$aS?fXjEk!EP62|Pa9LW8>qN$i6b!EQ68nGsJng=qKOS`3&7%d6*|gsk1;AbYe-z{v)`V2h3%+;3tWs-xlFO{O%C>u z2?~Iy3G=RFK)uO{X2!hPj}v*tYVH}O0+~^<6xr$QitcT zDVo9j!elvzJ|RZ*<)Qc)HQ8$}E{QiA=?L@hO zZm_-R3fk9Q4}e4;Sd1(&A>j;*tDMSqYLEWmxrKg)B!?n_G-fD2zXim6@2&h~G*E7c z`7m*#QiuCCcX@73%B>Hauix9|n`rt8E2n?9`{|1RcFOQ#jDZ%~ndcPh?^|fEOI#%J zVGUq>^vm@8um$ftk~e>_e9$$qvbFvnL9mw)E#@2v%;91z1Okh;VK@07Epn;7eQ2dw z3aJB)0%l783H<&J)PuB>l1z72J3-e+S-h{m%VJg0B{78G=tgxkzkljTOVL%Q_~n0G zmR^v&slKFje^HhM-gZ|Hd=>0}eAVb(xXHQ&PE~XY@Gq`iw^)9$_`k+0#&)e3K-U^G zdJ~QDKp^+eJ?hV&iefS1sL(cX_6$sMDfQoK6%iwXW5p-CL1-l?-#1kByT?g!$&KS%ZV zGO5#F+bzDW_tBRwg!cE>pC!U{83lxpxW&lS-zmGyL`nYAHxf8wemO<|dO+1}A^h0! 
zhoI{C+k_f;3)zoGfdq-VO|i+H-~K%3CuH*HpQ~Qh^ZeedvlZf@wZnqs1`cnlm~5?PapLP{MUr8wK8~T zy|;guDf95kzuK?-`H*Oq?;r7MKDF$^+g5j;Ab(%TD^Sdlx$hE-p}*MxH3u|hzW+&Y z3oVB%e*W*PFizA%F8_@nJa#c5_4kc_l~}G_9|F7OVrz2aw*k$iczwXj4Z-CBgxXcR zZ@*8+dq{`E$TfIdYl%qe@9#_p)w)8)0b<=YyKdp%jps7GM;HZt3hO^x5)9Hgd1 z_0$)D(M8)nHu;^%HCHP{zxtR<9}MaQ=OP+~|HDWBd`K&`w_L0A8bx{SRWpJg=jFfu z2Z$h&-~oJR^cQ2}Jcw;Je@^+|zAaQ9A{|o!H!)=ZhQrIm`7b5XKOYCaLOt+?iiXmY zM%qkWDEY5isz1KYAWTT&Wf4+e5omp+c3}too-T~Lp;m12UbOjoq+`@Cx1xXE;yOfB z>V~F-1QyzS$^NJLf>0yDcVb`?EW-TXQ~$Sb#x!^%q}G!B7@Q8F{lXmoohUFXCACiz ze3$fFr2dzMk$Z=VcDAR0Psb@fmiq0})Wgu`j8Js*X)Vv&HYdN&q6B0`U(v`pdf*=7 ze~fRgSO|2odZPW!4+vW{U_W&HeIajphnh=Bmvb9;W|cNVXb{4tXy}&3C8h z`7LaZ!yB7^_alB-PuyDlZp+_t)G)#OZ7+=3`oixb?cvS*<$?UFyIscRHk)!xM`w{E?Cbxm8OUanLJZ}Z zm`#ATC@!N4+Hh=Ds6A_>^E+P*4;>!jI3lf%na)o5Vm-IkRF22qkq$Rq{$=*(QW^!% zt3FDf?g1Ins#n`QM(8o|ddmsdm(|Wgn8scGS5)%1k4`LMMaq+u&v(u_Mj~o=J;IUO z_kNw7sI}sq;GtqeCNP{IvZIXHe`r*|9XI$*k~4gX6V<%MS*+_C@5B$=x1=R#cx#Lh z{oI2f8~%k&PDRBCuop2bNYAquFu}Rxy$gNJ6044qO?c}0nlx{8_1$E~5f;g$Oi#vT zi@dl0SpEwL=pmhvB>nJfaBRz;!PrxpYgn3Ni&xTJ%4H(B`ARB?*N01FR#N_vssqWj zSfN%5Ir_A7QhtkC$oZc;4!*ERW%-^^Tj`dAXdp4>Jz3|fn0=x`Y-#zTZ_PkqR)LJ=EDa9FroG{JSt#9sl$Z~*0}OU zP5skdc*|=)b?%oQwiK1W+KntStS*`Bw|0YPNOb4tjTTHWJ&IiQ&cIA`-5zX9cz)}b zIpzT}yJGp3l~K3lc}^L>e@D-eA%vw+FjFJ2Z`5uMn)6LN?99Fi8}eX9ATE5<^TYA@%ycJ@#)-INA^LnT;ka6 zV12@UqRzEuu9u6xW^G>(0kMRuj!l?Xidh@=*r}H@K=Z}PwnKtY9BBgW*PT1#;IDy zCUafxec?*RWoFRV631m;-Wm1a?y?@M$q{@e1XGI3Q(Wh2b~@fRf`Fl`63yVRBL}lPr$f0 z=T&Dd&wPA;vF;owwxg!U%UH9{4^zf!ZM)Ig2?eEERaSir@AvpJN;HP@2n}{atTzEp z#Y{RD7N|>9a}JW}Rn*N`MPHlZ4fZg)xjwA{MuDn6H|TKr9w6UJn~M$b`%Im*^FLag zg9_%-*l3Dq$4GxXpHn&D&W-_E0n+CM#~Q#VZ*s8NBg;)KDE{#!w`jiGjy!pc4&NLU zOs)dMgA-tEb1!SH>7=F`4i*qLJRVI+-@;d%?4Ej_8rD5H3|c83Rf0@IdeI8^2oQUl z_FbH~AMO6UBbaBwL{S!Zk6B*buG16fT8Ny6CK)GEn4X_37I-*sCgQNwHUeBU)OBxA zbv4@94}jw>7Zab2eD*)#6R@=V!bCG3fK|8*nl3*=c`SqVb!VBJ)i7;!t zf?^rpdP!f3DFHNZA?jI-W=qZJD$}Z5rfC(H9L@CRo%a!Ku3<`QZTzic_NSjj_s$Lh z$S(w{R9b>2pYSIW>xCzoE$_@o zf5cJ1A@nfj_al@m9dMXd5j^>6EjjnC>)!InDXiV=d%341J2u?v zz^3eg+MK<$tk7$MkUWc?aS2;B036>)-2!TVz{EgN4U95B1>7=UXJew;W8J%F?= z!`WrtV6X?7Wc5Owy78*zRl9DbtozeVe_#%i|j_7L5Z4+%o8lg;?hIJ3SgDZ?jU5)uz~gE`hpDv=RMmsA?G5s(b1 z53(ureSb78WxDxY*rT=U68`u`WG?qp>9;GV^O1>bBQHYA03E4kFNLSC)5<;sw4qNV z&H+^H_Zj3YWR|7LRK2M9W4$MiWl@^K6FOV%#YZclb1qf3+}fqC^$!_Y2<7uDxMZ3v=X%v8XTNI#=3e*CQyDb*0d<90*#023$8rNt^;Xx7=YTI zzd{qwM4XH&b*SuKbb{VB!4t#r?A#d-=|OkD*Bu{ii_5Gg5C`m_`SErFThk6#760tJ z61Sv#f`U_kCGe2yLf`4(HrV|rN-mJHFI{Um_h9Y0UJia{_6X;#I^XSYIqgXXXLLdP zaoD9LfW9vRoU}rYrO|B)#7_ed2>$Kkbm) zxpv;?+coM2d;7%YIHJu%1~?==qt+x$;}$L0-O77Ob12;$m+HxcemHICxP6cCl}a6! 
zhC?0l*)Q(PuOFJ{5B5&FAJmoZ=j*ithP@R);6(rmngr`qJ2i=Y*3CZ@x*mqF7>O%{ zQ0(ROUdshy50hQINDK4P;&2ML&8B(cPp7IpY9J_)B7u)i9W-M3-T9oaK%`$ z(9)X8T*6D(7)xWdxpQVM}F}mhuv~AGr0!l zrpF7i4yiQQR_YkjTPt-!*k|VF+jOw1YvQs^g!w3CmU=ADv%5kSl zpw@lh0K%1*R3rTt5-aH#~EyRCM?MmfQr{TfnHpk4Dt zMMXP?K(k+FqojM%YGR(_ktxk!YRV(d7Jqyver3EMX{dlS4vXI~Q zM(t*E;FjzWN^*cqm$Re)wtv`xjeKFe6svl|h$@mFM4N*P*;p%zUX1l74!6sdRFu02x%N3cz+k~$Ntz7*8e1^TIc zUW`LS*||?vboe%EciaY7)QJwEsJa&bmmn@ufi{O8*1ro9-lb42 zE5W@kyfV->Y`dbzuna>nVn|gLSxVLcT1vqcFjkH^l2GMisM$lu(ZYGA`J@q38gHKj zO%3D~l&VI8h4AQ!Z~1O)o65;-p$e~EuqX*jxyi%l*SQ0gCS!6nr^U=QJtDi>VELta z1A|+`9`huV;R)Z0p_1^LNf+d3scF9g-t9hK=@`xjojFR{z06W80jZNrqY5gz?hf4Z zuj)^ak)@2W%yiJ_J`DXI^J@Vc;9D`X(^?hu_cOUImf2GKm^1R)z_-fNLl!5?EUBxK}?i_nK%_QSZ++7FsN%8tYl)~HE947r>b{lMuT6!m&AytBX#3|`zD7`#Z+zx4N@SK?yAu)2-8M9ht@8W|jh>9NWYCkz zWd%NtbFm&TIIa~k_?i|E=azY7^?}=9Kvo1zWU!XmLJ#oDjA&(Acmb+SyNzH?3qV(<#TE_In%Q2kKhD&2`IXjBX za9{nPPrL%p^|gq)_0bZZ-EaO%Ds1kB#CR*t6OY|vGk%R?6>z=7qdU#0b@syCDg3FUthRfIs{@xz3ktx^} z@a?MR1wAn|QS3JMN2*fEKV?pX)hISY7)emnp8^?H#WemQ5)$z7VmP=D;aRa&^)-p= zDz@dLZy}bif{nXO2Afm$XjD6j+=mT$-~OZIP7#IB1ZKrX)9lci&{OXhb=eV2aDy;g6vyS!S7;N#j03B6~94h zpfOVj3^iY}{O=GCg)A{FCsAi4sOcGdw{iUWga7zxSEQ#Kma3rkcu|}oeV3j6U3t3u zE4dYJQ)6k7{YKz{e=Z9-@juIJ|M?a>recP-OI{puU3r(AIavSw`Cng`%mL>x>r@Sn z4f@N)IcQ{4|MD$RN=$!IQ*a&vfd@ll^M4fU?k>xi$`W*Hp7#(uR3hf9_^k|lHw+Xc z=<4k~J9k=amN^gau7rXY!TESyhj+W5s-QKLjIXsdh%kHAggdo!&FR;V|1p-HlPG0oQ==TShE%}K2;u#I@pAtd@8m)RX}+BYCNsIXuyGLI zj9-|R=Qh?lX!(P#m$>FP5^XBp{BTj-f6v2b;br&FHI`g)2b1G5r~!Y`3J=p?_-;Sf zJ)C49U;jmb0O3OzE`r@6Ja5OS&Oa+=c2t< zsu*$py*xR9nj2LYLp+*PM)1EZA!($_S@zvq0B`t*_Xz!BZU1wE1>Yil`bjy~i^*qN zx@bD^^0!6Sj^+8qM;JhX%k2}Z%)bs)n}UK>riu~STuNSxJmY9ee}CU{v4^inl_PMK zC2>QuT>o`B?7$q7`sr&$;7Rq@>PzOo*R)V?mfJHEJhVs;ukm9xd zEnED>3U0Z$qf$ZG`?uv!@dF{E9$;eN+D^7S((VF@^*ce41o04(uSe)Ge_z&nUk@*K z#$a$+-CU$)zW_4r4@f~|32uzBvH8`{LwsES9vHTVuya4D!PtL|mq!1Eh9%>{b-odS zTTp8#`8%2m)*^x4nt|t94wdgt1pkP`|1RE@NY14n{LYd>lF~8L_}5@)oRY_WI_um zN%DR}9_}`OS^hVQ5tMRJ1j*X}83gcO!C(YoO4KTWF!V8%5VYT>5yle&JhACSQi?Ig znIGrwy8c_3$}aJEqlpcWMqQlge=MXw@A7S-U|CtS)Rt!Fm+Z)oMB}`FJs*EX(_#&Z zJD8A@BY=#fORDmBss&QMEl3!UK!IRL@UB4czf)li%PA4z^AiiC6BPgQb~n~%wZ|mX z+Tq+ZpMLdL{MSGT7NUN7ge5WWJh9AkukF7zFn=x7_W^=WBMd$ivN**IQcV7B1pC)a zFV}c%>4-E?uW2FtV`=|qkqc5IM?`*;*^&>&U)OrqagjePOzA7SavbR4 z{TI*Z&l~aVekhAU9Ay(t!U56-L$u$<4**$gYMG8`-$hpr13muOwf{Lro)54sVTl6= ze^USKMd|r%iag`-K0^$H1IItv5PAOo^Pm9ssv+13$;dzhU@v$s09Yot@L}z{Dgntq2yK z_KJ82(yS6kk}t21^H`!~X+eBi$o-i2Ks=cdiCf{(Dc$r=}@Shn$O+hZ=VO zC3vmCVc{182dgfje|_8}Jr9rBck9h#vuh$JFi-v^x>3k7f>Ar2_$YH%yhR@0PyOeT z`_C8w9KDav{s(%VgSOh9Bsb;1Q5cYX9)acxR`BTJ{JG-)9$`;Otf%dL-xn8k|35IE zLObP)PZQ<;|8ZbrS6ce~|NS^7x0VcE$!&W0JVWT;?!Vm*7R^8Q>gW3~^tB6A&!vBP{;{4~ z{YbQB5*37|zKACHuhJ)XB0>M=z(MMP&Ly)_jmitNB z?~Dj*O$yMt_F>n3LBntU>|d6QB#5HJk5Yb-QT}x%+imzvo9<%pUoK1kn8=_zy;nTz zYZHQpkt|uCeE%B1KOYznhDDpO0+*}aaa5r<{(c8AaD-Izhhy1M;0}Vgss0U$*QDrM zDFW6wV6wKD?uGn$B>sC@Jr|)^j>SUvdqxxtV8|c^;onImfK-r*F8DL$(N`CMwchCd z%TQ3*+z2eZcdC?O*l- z4T|kl_H!qYTOCBf`u)Bk#XL?{O%+aI2H|icy&v!QfM@3U8FhSlCHyD#4?qq1=P~h* z!3NZj<|ibK-@)nT`DT9q@BL`)MxLdz!-vCN)LFP(j(0!cNra4pho@F;zhfHA{`LnD zz392q9z;Mm2h4X<(B)YU??q}+Gw`4M#iQd zzw1mmTJ3oQDfHq1yyRo|`dj`Ppi4t+0@RPqiW@Hg*nC^|W7fIrwB{9r1@)iL0tGdu zJ>Q+SCDM4aXyvb57!$j_D{vfqW-Al>!Guj8-yS?m3C&PV+I%m*R!}(rnI`bmP0c|Y zbt%xP90qceL;0Q~e=kbRb}`TFt(14-g9{ojpd@#q?6Q%9Lzz-LKrYYb+)#f~f;_lT zshGOX4O%0|mn#F!b^8mhzS}Jx-wZ0^Y27PYi z)4O#zjUziRs^H)bgsfh{MjSI7Fl_`_p<<6=khK*oo@V{(GnYp$Ga2h0)pSTul=)r* zivY3R&;@{`NdsSIeV(Mb`h~<-ZWBBloN;ktv-UNc)%b^M;NG~Yi^~jio@uH9N>y7g zuixFis`zldKgC-rnmJ|AR2HQ)A~G`LYUs#6BiD?LJ!KQMVu%TGvgo9~v=ew@=oi(V 
zRX9S6rJDUvw?n`08PgQgvi8?K;lk*G^X+pnTBL<(}~;4YOoDd zU@I*8a*b3;V`q&zyv8IE|)aF(V(ADV+l;YfpElY4N*gOXS zAdM@(I$omtjnhdJYTp#-2a?+JJChIZt2o=)JaXIT&UAg>v?lCIW>PT%I`-18>cgF! zoXX9eMdF8=TQ^*50CVFB<0=ar8eMVo6VgQ9P%;j*fZpD_`IN*N2c3j;pxIki3>I%C zhLA}dSmoj)3Ol(*Ncg#kV1vk zvXQZYr~o9?P~F}9Fx*@Dua8-8O`J*Ci${X~|{g6uw z``BEhjDI@9La+K!$-ypj!Ll7XsXB)KV&CJn)If{YyLWX(B1K~Wb#E(hNWNYozs)W) zw$WBA3zd8&AXs$wJ`NpF)pxK#gozQym-ZddAnR7l;`J`z0C|w~-x&nxh!^9jIm~3 z`u+;;U$?Be`^5IK->9k%CptSvo9#H8F+8KI$Y|M>`LVkff7DyPyDOafqF90d{cHRm zYY;8{@PeY1DO=@+%;6nf{HsP`jdi$VX|U7~2( zM(U7#JPp7ZBhcUUd6ThsmUQ3 z3=h6n^$^0WlpRHb$i@tfHv zGC8(3%M0$*5#Fu~?f1Fw8iS8o`Ox`2sG8#onFNrnhE5fk;+mY1H;>vr=w+I~RX&zc zjbnuH+Nt%i#pknp+k8ip)BDD5O6k-^`;PkZnr~}(D<@Z8X{IA_Yr;n2)@`&dEb{%- zWA&oUu8LX63lBFyH<@_E$@|LejcYkm2_Kjs|Gtr9D@|9%VcUANS~t73Ovu`h*&cXo zb>3(+4d3mYKm-z^Iaj$^A@F!84}!RZj*1f ziJF%;M%*ZEI(KPa@GC)l+o7!6ug70-f>akJoX6tb7kp0Emt@AN ziOt@BeK~&XZM`~SFhyvPxbkD8hGFgVCfA-9>JA*|vf-9l#XG!t+Pz?Gmz;r{k&rp} zTg?$p74Gd~5!a9Al?&nm!)w`TA!R_Fq+;;qu0i+?2$KBy)ob<%U=x*>k8T=ndVYT% zkSRX*Axd-mPWuZM2W?<|;(Jyw8Q-M+c#5?pttzpd(Cx+6EmVf~X-2dCCj0T+?Kh?& zY$7!!H8oxEg@syspi9z*)x~wg7kkD4Qtx>1Fo{deGZk#4;{cQ09UHPlL35~H4s)~L zD)@1xMP~uo(7%npVRhUWG&s*gBjIXi0yv=SokIBGxg3tDGmwJbN%0HAa0pN;^>Vl~ zyaBkl5AT|CHJKhhL{LFt&PK96|9+9wjxc@!pfm)l|F{}obRgdQnv`VmW7_)rUQT-8 zCvvxFQ+%!IK7^KUtrm;$qMI#gR}_T;9(QyB^V&eO)}WXY=*4M$6M_a$Rf;A}>o+46 z04C2YZTpQ=$JrkAC4Jm=s+)34;>QOalrkq>5}{;EQ5ysmb8fr>WhfewO-;5Wib8^;f8apJ07VF}FQNsKVsc0d>agZ>pUJU3mB|YU%nZE?v(!tIVd(QKta2tpQDn#MF;`c1rq1$<=h16qMXxU)`&8)qZwQxT7ZB>CvB!b{GNgUuaPm*q3_{-P=NMM_og* z_yrnufc54??YrBhcYP-3V_SezxUJ=qf zwM^OxGz0XtUBtkoo!gtDqB}~`H8IPadEGhQ0#ZcI z#aLUGRBln9yC-$E^J8fSbFu5{y4lVtNRS8a;#B;W+IL7b#_m@pN|fcHZxqY-@olH~ zQLXW`kH$MdPd!4t5^GENF=q~ML9BgiqqzzV^>-ePcfKIB-y}3Rd&eaKSrGa9GErS% zt{}-kuQll1s7Ti14Y#t}NEY>B5Wn;Ru`cdbHk!3mhsi+6-P z20{at>=WxR^48ycoT@O>JGGNL8`HAc_NPg-6!Dvd;s-b%0*yl({37xTs4ZpQbxYLY zhrDyKyWYs^a6D4wct7!N;nPbs>hKUhf% z3FV-Ad2Cg{x;)UAN{PDCo{q)#fMYbr;4)goyRmC z&K8{J+%5|?_%Z?^fVI{e!gms_%|5mZM;ml_8GgfW$<3|S$i+zFdKvzt2 zF3=qC9i@JD*e>M4cm>P~z$4D;0|lnyHcd=5Ap*_!7=+dqE|IvqTf)+?g&@**YIT!Z zw^MQe4aCc{r9(l<;!5MhKuR zF>$;&;X5T8U5bs!4N*=)biO9Fm946NXXEVJEVq7NOE&UUP7t+;GGV3K_QAa?Lr-hoc%vAUDf0i?A?-j*J!G z83fl$kE)qS2?RYGG4PtT7;)`2o(dryJMq-4l^MQN@~j1lZKH_OxG(H1l*s4JW^1Zq z181oMq^Pqz??p)CF%6VEMnHugwW7J^r(BcF8M6BJf(TkZS9=rI>c`VfMfqN&0AuON@*j5+^kX;EHhooOILdL1 z1YYl88_jXQ3?fuJ)87-rKiee4x{@bhuxOv^VybV_{KAL#rsg9s;JOHGx1CJZ;#acO zA;59%81~7(u@%Y>q}7xJ9m<>WY@5hfx2Sc_ONKAMIwPm6A8)Por--=9P$Kw>2RXKC zHq>Z?>@2V=?LZDPGHlETU7P?Trm+{w?{;}7hMguG=S~R!AMl2%Im=0*wL*P4om- z&8F2Az88yA1jA{kk#KZCu@J6yUEl$iv+45^BW@C<^v;~AAL$-joPTR{7Tud^rlwBSG_6Klj{*Fa z!&z3Rx~(!eE^lB@UpC9_UTBCXbc9L>F1f*T_1B(mxnrAj;C$Xu8Nw0S;yy<8~-_RktnvI!7r4j z=l6P0I495F8H0o`>*AdAu_Cm=*j2p4ySdys_&ul7F9I@t@yFXZ;;4e+Oz zO2_myol}g=S?>@4b?xYTXAVs(@NHO0dUKLwJC)lYt2?lj4eYh}v+*rbD0wazCak-b z`5s_z~3k{e&zoNHA4$yUlS8LZ1pvluh1MyaMpbw$8 z)k`}#mP*g{UEz;e$gZ4I4DaclN-_1!QJQwF?`kU|EiT1Nyq6*F^%+Opd8huP3IqK@ z(8OV@dtyI1pnK`gVoFNKd_sKqZ6T!i>QZK^X5}NgeDK~hpb#)LUr6qp+jnz8w-JB0 z22Wjcmy~3czT+FejC_}L!1p$Mr04m)+=sHE`qu#8eeJ(?SIZD*R3fZz`%?=bc>xQe z@>=OzC6mPkYCxUhri@m*?MINPoUaMk*RS($yHSosVQVJe+cM%sxUVRYOU5zje#IQD ztp#<;Ti`HW0zD2hV$i3zS%D%w#wjcQV)bX6f%(>N)}us7hk!ENlfu#YP&?Qn)Ui9N zt@!0=lq;!$?$`+^K1{7q%FPSm=RwvBbJyRv?sQt~A^5VTn_NSAFSYc@ot6j2#YA>C z3n$(xXcS?h1#bdspZmLTGty-!kfR@aFLU`}GbRJo6`DWbljU3Wlu6!2PJ4bpg1iD8 zXvPpS&FwsKHI`eL#+Z&) zYPY{JNsWqY*5paZ>#7!Or)c3kc+(G$MvD^QOr4?raBED=K-TND+PS;smKKReZiXY*tOlvU+>^XMfIWHc(RvI(|je0A6 zwyqo|mlS?4ulU(&@)#K-@_64N=nUVlyubT7KmGi*iDY+*JrwX}! 
zC1-khJJ-0^8c5DP&3js&rDv#)&~230kTnVa%cZ#=jVBkJ+$GKRbA#vcQZ7mBR*MEM zJmjpe+lx4IqqO$-OqfaNtXf;nH~^71=}Cf3tm*AWjZCEaYwp&FP=kx(xog>CEqAHw zJ+ad3;s>6hu-8)qr61n-iau%jr8U|91XdGfBTy+fxo5CaTdF3o?mem7w$<+5TpgnH zoK3H-mgteICgkN%)Zu(!bX`Pllyt5~@Cms)C0>2fMP-rJHV|J~ahtfVXex?8dG}Oj z&zw;x+fD8b1H@wpr}|=LA3q9_Y6AbT*^Fqh|(qnhCNW;!}p)6#5-{KQgcC~3csS`sF(6?G00&BmhC22X1mw)b+% zC%FOfie6;69it-*PlM6~vw{FM>Q`BD$s~ zwx%`INDm{gDMyA2EtX|UJ)3<34nRql4``3}hBh}UJ#osyp_o9FJ+oJ)LI^KO27Wc%kxcXlgS=8ES--<9q<<+S%lIXCU7=pH;<&JU$_M zm&tLT4`IkjAmf50&uB`ap#D)3DmX+QuCXC~p``$twJWs)VLMD9Fp<6D-5DP0F|KrD znET>vo&Vz<4SogIuE! zJyKd=wn)f}Np$YTrvg;NLyVyTUW}b)Hqmh?HW6_vRCN&$t&MM$T6lr-SI(L4Uxq_` z={iS}_4*l@{rTY8;>9{%h9j8zVvsMmvnU=;fJClZ*v30>_m@m{4G@I9S$ta2Fe9fc z*3+CGh_yoTnF;bGfQd*y)qM99q9GJpMH{Wg>z1&&TeQ>KZ;~xn?-tdnUpdHua(`We zF2@Ws{}g%ODJwTb>X*G5?WxOkz0YSI!Th!soo)HG<}842RFaZ}@QXnCP{T*-2sdA$ zX$%x{hvrVU#`?7OmCKXnt1=ml%v0kSjlILpG_;9cB<&W2(VIA`EEe?~Ol^T$xY-of zUfhiA=OoQ9xe1&hS`y#$v!6z@fD*>7T0eqhZtB&OQUy8T>DUO+S+vN6X{VEwC`81O zqSxPIB{%SFR;{rrdAQ81(LVN-xThrAsT0WFt|gzxE@Vr^Rg{p(H9|VfBjb-ids#Us zXO6D4IX0b^4t^BHcW^GO3$zY=Kwt^Hd`>mZ*9WyvRlga4roWH^FqAjYqUG6gS`@D> z8*y0O>(`S7Q<;pIYKhgNPGfx0(Y^l8X-}VKo^A(JDDidR zJWv{(q)BufJT>Kp4u`uYziyngeN)=U@x;4u)6pAZhcyQZ2y=Q21kcmSkcsqp%%{h- z*##bdhx`hCl4lzI7zCZl$H21TOfRz9%ld#dsRiABBEhz|55)R40Rda|`ZM8oastir zLW8-T0WQ6nxyl`<*tgFV_9Sap=WJf(d8@*B@8C9!8i=%om>>iW)|qA7#EvK1&Sp)D z-`a1s(IF&4AuqP_S*Q1l>~D^kOQgxDz7%@Q%AoiH%t|>3`fKxink?#3p#CG^$&};^ z(VmAZbr=_@GOmQm7y)S2rn-m_Bae|^0`!+K!Hm}E_OS8r&ZOpB7Q-#(b%)lcD;X}l zb3a>0!s+~iPm1i>6bD*L8TGNf*(d7{eb<%u&&VDLRS#DS*MHj#ObcUb#dj5wx@yYT zQh?%XwFdJJ(xfV-4xgo(F;%7O$^?Se#>?dhpjVb3f%@2O>iwx*p&}b`P}t}bbk{)} zforxF(DYjLddPtOr6nj1WNd3<9>>d*9-Kl|oL;brifVPAP8e8>sjodvdM_{X={PU% zCgS_kvk}#)=?!agO}mlEn?>=^kv!4H)U$^qGFQ)Mlx1>6e}HD8y=MxMM!I|*WQeuT zjz7FAia88S>omE4A3zbFNjopwQd3b+VBtHlm+z|^MYrAK{22N$cXRJ{f)}G9Dfcuq z>2bzWA{4^N30XRM96907MTDgWv>!i&C)660{=jr|&tQA0_$W0jpY3HpXZLMQVyI0* zA5?X&h>375r|6@=TH<-U)!q?*-x0~APo^>#E5WDrT%|l?+{>h6f0;{34(oC!-A!$Y zXdb)Rc;Qb}I`M)`>J^en9wVyW@J0Tiq=S~Pi3AC8OlFunijkQ;(@c169hJyCi2EPF z5(pPB;}DCaY__wm>eM; zFHkGTY4BY$@+t%$09gAv06$; zylS44J6L=o`jh?}WYXn}m(2A~M>vWi*%YrJcrTC#W9-aBVzLhOkA|d8=KR#A=#+9v zN@}V3>|lyndJT~gw^O7mVvHD${L4~COHg$*=kL{Z$FMq4klMhpzEU^o3c{a+Z5b2@ ziUU56IY34|K%YPlt>Ez}FKb16SCGYS%BRBbH#}ics>DvA=W`!C3WlFV)IW=#7|T~x zxQKOHE247pOFJ6Kx+#s3DCt%hTzE;GwFKPGBTZGtRJ6EP+P*Fc3UfE;KRwzAP|Q(K zFM`Z(2Px1l?eLuCT&{4i$9o{o8My_3V1NYRo2?Yu~Bggv96CjgcL8ha|DD@zG`R(|Vp+^d&sY zREQ?|Lf%Y^VyH`iO^9rJ8*hWoTu&oOP{i&i?h9@F+W$KFCRE!{@_oDC%!l~311#bI zm|8Bo0E}f+f3B(wl@bFsBqeTYR<23^806~w1*D@CU<|1Q150)FDez;(0@4FRJ9AcXl%Jrm2==EV0X zr0wpUvyhyZ*WUW0)q9K>MA!gO$9)D_?NJxatq5vCwMqz9*+5;^DRX+wt~z$m@xx@y z$e;)RB~+8WmkM!rV6MD>k{x#K44F2TJ=ur9F>KE^OD7x?9l{m@fA@+)LWFtE9C!fF z!G`f-C68$=$JAjiFOgOOU5pr^*-O8NGmaw>&ZOH&j^;_e!DU6R*xIhpK}R54o6J3H z3N852eFn41speC}(~w?r#POiUVH5$PXr?FBuqWpTLS@FLJw)zK2P69XSpxX=gjw_ldkDBq`YWv}^k#|m0m%-wLx5f^7T&Vv?* zT3w{ELxOmtfbOv?yueSgsLq|GgRK>kkaQNEqwS{cu2@PO%EJsdJe0BIMOSLqMP0rV z_iS;C((;kw!-?9qAkR=cw)+8*&-OCT&5y*<`X3J%aeNH%cS080zFGSDlcZ?pi8bdR z<9_!qJ|*J5->2}4GoH%e@*J*n^MO>K6YR*>VOP*G${l)x8kqf()rCcFbT0IGUi8P` ze5L#sTSk)9L`Gpb)(VR>dcjxl0fK|gfP;=Cn&8aCXZZD@=pOijao9Bbnl_Wp-)6*M z@cSy>-mW^qEK^ERxUI8A3ny^+Ij^T6LU$Yc4&LI2=IAhnEqVOs)$pVfw!6;}!|%CS zjcD9<>?cR7<|k0?mqoxOsuN zSoZZuDxR*H<>xK##Lf-V)*--M6H}xN9S<+Iu7|Hr*PaXXRpK#fs92@5D;CdVUyL6l z3Rw|xm?nolEh5IT_v5F`{m{iKvrDV6%I&J?aFATN;5NUtHKIJ#@ycYC-|299A@vQ{ zWFTM?raBZ<{1UPFE$3k(`94;#HeJKU*RZ=?aUobd8kh*eKirwF8X|-eGSAzoMo~ph zrHa|ON*;?m^f{$9am*Ir4V|t1=(FMktd$hOyx?4$cF#zutx~yo)A)V{7-jP3HhG)n z7g--#{J5Z769#XZX;-b;3RyLzTdA35)<45R#Z3b`S>SXJE)#g 
zq&&8mh-1OT^)8f(G?_d&Oz$LX4?F4A)dOTJ`CS8ffHTJb^I$8%VKxN?-%Y;u#`!k+MUvZvk=@z8PC1|tV9f$Ac=|Rrhu!HX zPss!@vjN-8YNy#g9#Jz5wPk9#R}X$JAZX^mX6wDbD?*-s|lG) zjHFuSb=Wm7l@$xX!Lw}ScT85y34^;C*Ler$>ixHGO1EY{8Fun3BH%X#GJY2GmuMfZ#UNZb&G75^TW*;gTw&}`K4s8W7s-qrgTE) zrE>&i<(s-!VjQ_2#%)^Xw`Vj06YJR^r*e8cCU~2mNcj9c5=s)MMPfc-cQr6DZD(yL z0y-KS-1|9K`i#Scns0am>V!O4x_jJjUF+lf*SjaZ#y8wlNHDd+CPUNVO&KyP)&E$q zT`_5qQivZLx9bXiJmt55RwU~#Ayq>sOV;d z1bI&u$R=|br#|{(0~qsTH45JPu$4<0u^JseVcp^#BOWr z0%Is^Ge`+&#t*?kjf4oKeBlrdO*|QsVOy*w#R;hQfaH5Fz$Qd$udMY7zB0db1x>NJ zuOmC*0+L~mz-c{)Qp7h=4oy<+x4)+7&>i}(HHGt$PW`wn5&Ql01-&Y zZfLzym9=Miu0bDJDogsX5#mKN0AT@rWjCu4b>}NM^F%x&rkQwRTLVMV+|rT*4&BJb z$K9-AN_8&hlDH$94l}f|j7Ed$a(Cz=;`sZXSfxogiZ&GzF3aRmHRIfJPwJPGLf-e0 zxb|C#u~$w(w5xs5BLxVRWvjC0M+${G@)>x^UPB}x;iA`W-k$glUG0E}dvJYo3EM&m zg9!C!1Lm#;&H#29@q~A8w+YAE+XfW5*KTlhEQMB|d0a0PvBes4fQr9!eHFL$a`8Xb@6=wbMltkGE=?BgXQb7=_Z?8QM5MC9 ztF5y`_3ygpn+c4`(=~aTPwC*P@}1^pGoz7qaXowdIGC1+ ze@u1-hfeV&s#0V1uzh06;CSECNAfprCC$i6=E6fUHNZV1&v-c7$nCx+c9?_k;5}U9 zYpy9b6eWB~r172*gmje1*<8>bh^BEea7(ZVGCUneDc+KSkRaFA*6j($X^N(kPg9 z`&cqsdY*)|z3dO;ygkkW>j~Cf^ zf?el!CgPs~tS&f)G}RLO9fMTba2CD$W&zM#L&QjW{49h)HaNus)$gP4DU}<-G^%#T zlA9cXRrzkRfuZgLl5s7zC~B!#hQ6D#B^fA?Zcdt}GKFUb>t@dwY~>H{-wyC-J4|3U zy?{!_9@7md3UW!LdGXYpsL2daKiUOCGm5}N5t{kak2=R8f#20P_64^rB2&ME_v#xB z4OJ#K$|VF@WbDU+uxt$N@T~2Cv-GOYgRN{0cI$Afi10p+=tqQ2 z;C2SA>c7;4PyMi9vg$Xk+NmfbdVi)FcZARo^}eHUFjkUQEDcjCL)ZtU^N8DK)jtMv zejWsqq6)K1p~DX@Lu8U)=@gMX^bZAZkKAuh-Q@+!oX}XvV3r@iYRp*vgdW-~wMrQn z3dJ~ze>6Q__xPq5_E>q{%0}Mf$tvvX=OA1pZR8z>Rbjv`*>bx4W@x~d(?|(Z0!U@; z@9Fi`NutcLZ351BF%W~&lE#X(n21=Fauo+)lvHW`pxUe5szlQ>46205J-uTyyy%}+ zJjg60LP;Gwq*U%#A^wqGM$sa|C@?DvBgA0%9IDXw`;8wu zBXV-zcNoa*@Q_7?_DCl{+9isO=*eBShf8hj7n`ll2>=!yH(BpqFDg{2s~n4A&Hs=E z(Qqg;0f`t`b(|>f(^`O@(+h7a-|a*Dtx=^t$yLk+Q{XURywK!>!h3nRLf^Sdb@0P< zB2{4WyFZ;%nmAKHCVKT52-ty0&*M1EhUvz3_)@4-Y4> zA}kF9_L_;;Fc2&+d)t61(_l23pRuy*~?a!A$9K%>hvh?6)L9bRSZd9#SpkNh7BEnZFQyRaoH;mMyu@GsDN zuwY_o6{wA73nI#U85v+sjEp3)lW_&Lp_?Q3UN`|pJ6i*%>Zsr@NNka>$#S2Vcn|dI zlHonOC`)ha&UU+oJQp}n_zsda3N4N}PK$x9fp?mIOV)H$CRsDe`-sJhK*tHMUeNFj z%l#d@``;g3JrshKN})g2*v#)4KS9-oUvHm==|+|S$DdJZ)h8<~d#|S$=rxF%20pdnj zCXh7-SLY6>zO2{1XKK0k1zU(3gpff-dYBu>YJlFSf%Yr`4gsb8694vPkjZGWuydOC zIokO(o1BFQpF{|J3YUjC$()4rPih4p!zu!50FdHe&_rauCtIGVWOOt?Ot z$Zj`f34y+j<#ls@YsSZjKRt-S*71;i1<0wRGPkthdFccbm5chsU?|#0VUS%Z@MrHk zd=YSW!*VnkE2OV6>`$)SHUx$dO7=r^Z}Tb0y67U3~c-Nw6$hSa2y_6<4bF>yzXJ;+U0&XMj73o>i3IW4zirEb>Q z?+}&iwMjdfA4W<-1(w5k?2JJk1cMVzC5R+5OgOJIlv%X06q}%7j~d0~u4PXL8*A7H zP+A(JGZk?d7)4mN#w3^rr=iCDtGq}I>3)X9)!iPnE{=1SB6J7U zf^&*_c+4$^GmuC_Ztd6m=W7+4A=dq}1vQw!Ts7T^$k!ZG=zq1B>@uEfL zJ=`bPUd)U$$LT@zs5)Hf@>hkc(_;S0@@c=#{0HSE@InYR;(s=#xB6w+sr`!(TvH6! 
z4ZYrj*`0}_w0(3TqykSMq9cwfOWwtJ4Z2&hfKRx>roIb%Kcff5J(Uae#wOWHw?&}2 zGe_Y;+ps_tx{T|g4^AhkP`n&Qq{*&t&%#Sk96t#f3O-gaPcQjdi6#*=qw~{uJ?#za zmc0w37j!8UzyGKh5=~E0mSzMS{T?Rnkr+zg{wICJz8K|wU(nT0t6G{qTW~9CUvt3g z?eR9_HTyZ5pq~X+gYfPv6Ks9%d`;he{$h5%aTWyo)ehnoTDT zgI|Ewqw?s|;z9!59mp=knq5X&TGdF9(M$_*yURqcO?2trbnKjt3|lb@;^h z*jw7w&<#=Ks7)TaEnB*vCXeJPlcV7{3{W)B5h*6od!WJ4CZp@*DJPAJr_`UrS8fhv zh3>yU*;aMdkbL;857e|#`wUG=bX&YyzbmIGaC(?@glXn^`k0q~i)F_s9>d&29Q^`w z7A@{|K-;Wx3_nGUrd)OeV_f>aBX_rQN%~T5_~R-T1^x>!M}g-_=p;z_9r^yiG;!$Y zNk*Z_{LFG+gi}v!BV@K%hf4JlRvyL-5G|;-h?l1OL208UzZe`*AayZN6@skGN6B)> z&Db(XT-}-X=R+vVqG2|f-4|V(_}*W{o<7W!|1ljVex}n$9PU@mgWdppQgW|*#6)Lb zz4M^$KNU4lS6hUkK+U~E8g)q~;d2(<>%Y$T33g3Q?P)_6zE3W{9hBDO0$99TXFvQc z`~0@>weqn&n>$auDiJ<6rnex4>my>>F!O&YiIIR3&Y`;qiXizQxr0zS=+ⅈNHN> z{|;-jKVab}mT>zj6*eV`J3gity4oASK+qLx*5kyb{7HRJTK!DKQl<7nK_*#bs}X7c z{78U$CG|rI;IMbxAKCK9p8ImR?Lsv(G&WUnO+kL!Eb^Iv(>&jU4r$wv5OzHJ1QT(= z0k{L{N;g_GdL@KNY zMRzarQNDH~&HJEa+%!=Pw7Xri!~0sfnrklGu=umOo~c*jp~vF+Y#P{hNp3mXbHz>DnXE9Jd~bAu!K z^r%4^hC))3f@BLWvG{GjxJZ8AelPk`OpN!m-s*6w7=4Uhvju2l3Bs4g`-u~Be0CzF za9@`{<-o(EOYLaVWaf!9C>^z=e?9}gii2wIMXy|L1@bjvM1u@2h|xquJm4|-BGfx+ z9`vtWpDMRUy932CDYmLdIPi1?0*~&kj;cvVLtrDuVbCa4zcHwaR^Ln1?@N?2k(s4# z5SPsy4F#!#bcwKGTOXEd=r^O?>8jdzC(dL(Us{4mN6e#A5x)~%=T-}k98dVmE$CX= zZJGx`+tB*G$|N2;x`EF^m-puAPiNz#XQ}Lsr8)E-fU~z0-1^(XtC};X#e|S^iudqw zFzkH-TCbYWitiVrr5Is{^baqFo_aeu#8gJNu!N$q?9K;EeLT5ZFDTl_HzO=Fr6^-D zg5TSIre0SP-cfl^Qn$c;B{GwUq+YmAj+q?(Mb-7m@u3?f*3!4~XnWQuKtX+`7kG+? z?vvzRS89mWlgKC0I(oRW*TIQyG+?`ovB(SC|HT3MlgJUMdv26x6ncEGQI63Idi6(! za5MAjT1(zlJ_6oU*s9&<&uX|M$@~(6XSl$<2mBZPUX^=w8l~`lW|zQeZaK|qZ&phz z{C=ZvXdi0KHzxNvc#~`z7$8f1^(-s2cquU1$5Q(Te58f_TDRXRD8sX54@$+6^0b5JG=T&$P8Xnd{#jgAUJUZ^_nP{{PfVbQHCFy9k!#dm?>c-590Px3Ve*ZNFAmr z1s`~@2$HJ7VG2rtmHi%{*qsf@7|8xL z*g6ZSD8K#P&(I;QAV{Yoor83PbR!~-4AP=>3?ZF@Fod*pGlYPGfFRwe(jg_?bvNgn z-~asAy?42mvD6{vefPWf_xU`}XJ3&_BuupI(IPdnpX_|?! 
[... base85-encoded binary image data omitted ...]

literal 0
HcmV?d00001

diff --git a/docs/source/assets/deployment/anything-llm-provider.png b/docs/source/assets/deployment/anything-llm-provider.png
new file mode 100644
index 0000000000000000000000000000000000000000..bb699f7571f4034f4c26f42c96df213017d0eb9e
GIT binary patch
literal 112470
[... base85-encoded PNG data omitted ...]
zg9ZOnJAdq3-aCK?;T&(Zwqsh!E&`XOHO$oDG;-Ge@w90x-3YM87&z|A8f)<|cP295 zep;3+-me{x*aV-A47;AW`9V!a2CMoX2DP14Xdc;DPs7+{xq@Ch*mc1I>!AMVBV2s@ zl}3K}dU=xZjSgK#6gq_VfyOt`zjU)<$s?3!CpWg5(co-UHT6aINC7}GKqF~$T8f z@2f@2zvcs1g1PFIkFSc%oiOb|0G!N-?r9ki48dFybzCIWy+m}8iE97^E>T+k z$%rO_qBf?uENeaR5)Vn4El{nQAm}K#(`%JCWKU52_m*;Hq}G{7zL5}#Dw6x5snX+A z5)H65d=*Yu?6uwsUugVI{5~!`zN9m$+zTFi)~fc7CCS|O*~vn_ZCYDb*i|WFE)OM$7A? z_D356lKyKQt@YDTmbUlMK3L`K#lc^J@M|R2Vl-Y;xelv zYooapMXSct%TJ&R66Q2&U$50=lK7lyRh_nTDCN{G$JIUtln5&y#QOC!0G&j#^BvU2 zf)w_E{vQ``Nu(L&`I8fr%w^j)zx8PO$50L^-32Wy3y1;KkLNq%GC`Q)ARv|SBtBbv zA#lC&G2djcbtsr7{p)qH=$s$IGON&i)Sa;Jf_;B)=@&AuW4JRw8-4{pcvk5GyO`D( z%x-30*0OCqz*nz*qmrEaMhLbHbe>=6?J0O&Z2uX*C<5xej;Eajh}XH;(zn`&%XVoY z)d4b)4k{B}782d0 zJNqL@_vWPU0Pl&btCcUHEsO&-xq+-?ru?;?FkkmeS{>ek$Wg(&QfnjGZ#HAr?XXK( zk)G~9w-Pcy|J&_ETfW0Y z1*3&4lkMMGfP!SJy}T{tb$4}S>vM1KUvv9)qA+?7Ib>SU`*y~jbu^77h}=hA+P+7< zmlq0n2soX0Cs@^V-2*{yVxd8lL?>HE6Gh`MwkX{R8P!+%uGV@(7a#1}K1~OFdZ8F@ zUXF1LXfX|oH>CHlZY=2ANZ4GRrOusFnjEu2!eg0s)7U=HEqQLTlBnlnG8&061f0S` zwe)AY3H~5{iD;Avl5vl~_%C6nI|I>}70*5AO_VDD2X519CF` zQjd0?&jtMJB6<%7Y-%FD47{|02lXudIp0T^k-LH*64k8FNI{0r?d+mfjE?2kk0W&{ zmSX1JH=)(m6NL(!Alo1P#W`UBi%$mf6xUR`zTOP^>mZ=bA?3DYfACj88;W|P*L>O} zYoE3~ci~g=oqGmQh4b?7wSnyWMXqYn02XJ-=WrIb^5+ zY%i!h2;AvbRp1$bkSa5_y( z^+_GKMOkS>i!U=C@B7o(`C~JU?{D|A%8QMs1(p+5s)5OF0yvf(;rR=vQI6nK_9fg! zazbY9xMTOBX#(fgRc+IuW2^YXnlP5glP=h7OB#R@7sH7TsQ`moq$k-)A11(ZtONlT zfAt!#t>XJ=lkxP!rz`vNQ3a#r297`MFQ7?xAEws}w|bm147>z12{HY=G?wsXW0i~l zf-rn?I5K&+_aEK60uKNB0(ISHM*^_9mWg!T{s=5IST?kK&vO6fAapb$J9PU6An0@s z=+Sn4gYEKCKF_bELLpa{e*R+^(EJYHqcF+vzSKB`10~wy;No3{s0)hnm9eU=xt_-An$~* z)-YausPulE5U9;|TkS+xH3&lXuV{am^E&@S5nH-^oaf6=0jRR%ci%Xd=o-F~#}o@R z1{{DL1zf{EM+^yk2Bd+kJG9YIug3=jZH?m+RVJE|e%5{`yO08)%Vtad0-2ji_UM5Iv;RMjnF}v!N*-IHq$P;gr1lCCVHxEt}J!@6BNO7eXO7DRht~B1`nCo z-H!EZrgc3xSVOn`io`+*^8wa2<*~_yvXIWzYPn$5CuG^~k zbBx=W7vzHc4lfWj?7(gcFd)ef^BsYq_y)9yIJ#q40L!~t89G^|0=d|Mqdys?dAHx6 zhLSjzjwf^m2suIz0io0h`tmRgE8?VXTK8i$ztMtq?c;E|z#1W?APmumdNZ^2P3pF7;YEQ2f4jQT_2`+peq*hUcd zGpDl5UQmRH@EWB!zU)P0X__FtPurdue^B0vJu<2vY=3;1*OB9O+SWIE$-zl$qEOGQ zUin@wI*eqb2%yF-L`1q?#5wQ9<(Ga7>G~T$1~I!DsgEiRx)(2k&$ZuX_tg(p?g+(g z#ocruT(v;YftW>d?VpMzWx?$vQTw}`!_pwm;R_rj43@3#Am6=11X^z#h$yifIq}D1 zkvARo!ZD-_jC6DJp78g-2J6s}aub#Bcx(asYaf|AtS~LI{CqAHH^dUbD0okd`+>+L zGfZkC@tFJ5o49T8N0IM}kU>E6J3I{b;$G=S@~xC|A(h_ZMjA!eQ=4D|!oH>Lmd#6x zIHT0DOxQeJajm$d+~Ea}i!Y$v4G|>QR!Ta-R$j#oa2%I4+_reo$fa8WHkK_Y&eq5L ztF%JT>?1Q}(tW7l!%2Szci_8nn}@&QPwq0aI&yFdXt?%QBJq<^@a1o2LFT{=Ja2L(f$nSGT1I~OJoJjM@r-}T zDTo{{j^wEIDn0pwd5m|F_p~>U4S)l##<~Jtv+Sc2G1n%%h7JMYwb@2HCUt5Lg~DMW zy$BimlDdd95IG;~|jMKTw zBWAt31?->;QWt=eVtcn|tOI=txz-wXi*cD}8MYU&9OR6o_g&ZPgzBoJ6J4CwWQ-7> z3+#nO$~&Y{&41DpJ2kgTpjYc@{|Fd17YC)QeR7Tgr(gW)Aa4Q{EtZ>lH;$%^`TzbF z040LNWO}O6c^ltnF!zBCH212K=KY3JsT@(`FkikSr0j9 z9_4XQIbvB(PzV)-9`9@T^xRf^MfHIn47lt)6E`#I?xs>@*+JQ&B{Vb;!Re`5z4y64 zv9WoLII5C?MfEv*PXs@}sqX`eG;NpxoY5o5QZm~}?r)J_6*N5~F`-q(1B1uO@Iw{+ zDfJ<7MK=3D-)WZrW?kvBx1oV=`OBz!9^<9G;HiJmG3AfIKEOe|c`M6X8ukB1Z~Nwy z=rFn@Q6C*k1lN*V!18{=08rE&M&}1)Ss{aig$g-~&fhJWrnTeB1eqX2zCQeS`;PTX z1c%)3_~Zr#`=9lZzTj>E8fEUTq^?K>Rm%lN$@7~MBk&V?$UImtS-_uetA?5q-{x^L zITFT27qaA{M11TssXoILW0JHQ(fEGX3pj}vnjcvvo!_TH*GJLkVx_bDWV81ys~sBE z9EQM1?-GpfH#MJG;$!OK>Ikv5K6iN3Y;g&19T|1G;(&~9w#DwJzf?iKSYIG&b0#b+ zb=3R9QNF4P!DPb6$8^@8SB0bKTYzAsyK^Y!r7se*A?M<&zwlWg@}?PA z$M)U_|9FRg@cMk=t()meI-D9KKJY^h?q{n(2JA>Y_U$seZYWiL*Y8(%T}Ac9qbWvaNsLR)7s=`h9aGjCpyW?DvaX%~XeNuKR2a7W z;_|+gt6CO0ble&VPV?0cDsL{1RocwtUR^kyo>JfZEt$q!H_Y1q+9)2h+-UDV_B`-H 
z@NS>f(db-bj7PWgEMd?1*rV^7QLinPWrThmI=To1@SsL@9TP*TQ6C2cErLmv@^~!=w{*Pe@DFnz1lTXTDckO6Z#4k4hhATftXX0BCRWUfTT)`+3dcrgzc;dnwnOcHRjk07u?cXD1BwDcTJ=>TPg=71LT2 zF1~85V$ZC_aha6S@o|ahzHs|Wiz*7*o#TX=z*3riZ>hO`-T2h?XdAY@Vo_MkLhN zv^Gm#XEIm48AS4A{@LG=y~%oRvJa@V)0fi^+0xjWmMAXvj$Ym2{%Q*Qq!Sc#0b=2P z$m$``f1ICf7IBv-5D>JcmNM-VS?w95VKKT+6GzoMZgFZF>`hzs!@Xi_3wv{Yc^30C z5HUd`EEr&0w8cq{*8E|sxSbdC%b>0oUeDWT4G)*f7awd)iD&Sn;5+9~#X=JA$vpLB z?FOp4V^B(7f9E_~`{95n@nLFl}rn-&;yVt$s+2AzwF?VC3Ly=Ju_kt>Q>Wb-)ySNAAo@T?+Xj5h< zznAS7XHv#(DE=Wrdro-;?K5;yAO}}Uuij=1b40nAcrMfZ6jb;jbQxSOqTZyRS2fHT zKMeHNH}F}<^JSa~op%H)?BsyoC&Ea*V~|`{o*pi|LpYFej)bgx#O`bSXpSnhKdA70 zAg&4tH~OwZ4PW8(j^N4(f>JQx!YFpZ>a~nIowpqLk>X0pU^Onoz6kJ`H#XhmgR}+n zz4oWdKp=QFZEV)JGh2bN0INT&%pZ?bZyy5c1Vp+wEiFicAks(*($d`>($X#69ljSc&&+R}@&DyLj{RXL-1n8| zxz4rL#Sda0XS&|2rM}}@#|^A0s-9LIW(K(+>3Qploy?-M@29Y&vC;24@uqWcJ1epb z#HJ#?^VH+OqxW=lo`U&afgXHTYb?VLs^E>DUtfd>m0&cn`p0u;i?6I=Zuw8|SSs+r z?hAEXc2lQr>+vDtZ;{4cTn6XW2`UC}33&t>Z2MSMe%a6yxkC=vcm zx!r5%lrR#EtPzcEy&oi6!QF#@ky8JPW;z~sW)P1#7LJbsYp|u(5_Y61PAjqMCx+4o zggAp5IS=Xy1lsyvMwUmks}14Wml zXhd=6&TzcyYnlODL!=kJ=~69x^RiUNP3wl-qh5?TKh2Vz;!xtG3#e_JVk`>~dJzj1 zLCRiIbGiAm#>>v*_9!*K0L!3SH5+auaH8>Y(*keZN9fBOoS#?55YO3+L-IG@D?uIn zJ>!nhuS{qoGp?f*R)j9rM&j=aG5i9ITUnMg?fQHlp*AGFoOqB*Rn<56A-l2rk4<9o zp#n>I*3V_(xp(tx@m}t>337I#JmKtRmWu;z_~J+ofJRfjjLAS^ z46ISviFmCDB{V(-0EhiHh)8MhMyR$W){ZE)-YG7}=MC_FTkPmM&of&#$1LYy$Sd?q zAfXnBRnlNZ3QrcBn0R@?HWmNlGOC~5cB*uB_q*cF)SOFKB8YPe{K|tV#;T~em?q9} z>D75$p7u-u9UNOt{IY#uFQ&B+U>7yidS%vFOU#mftNvpj;HGeoJJ_&oe>gW^;`KD` zkiaZy7w3Z}cCyDI?JiGyS@_TW)g$T`*-I%jsq-X6S6VfX3%gemwJ6_y*OqE#S0-Ff%zf$4eW{90Yt zBtq%hxTjB1kq#M#O~asH{lpWFV*Q3zk?GGbVl;4&Y^DqXS*DI#Ft4mMs>qE8db|7L z{S!_D(o1flwK^{y^*m3;I7m8F0}~Wx@dhmwO18IpenI&8W;Xs{fHz?TV=>((4Em*3c zoLg|_wsz#Zl1k^7J@E^*yPHb|Tp0xjQi|^17-Nc-s^o2T#jDmZTB&8g=4Nq`GY+}U z{S@vfEYlqkQ5_)|W9{xye~W+M4!p|%m0*0}F+BbKY0=caYK?LL+VsB>BWbW-UkRhM zJ9|)|qx9`!i})b0&bEHVg3N~=vuEMH`EtiAPON#30^z$1ZT&@OFN3cUV!No)g0E|- zMo$y|ygTeIjf#gC2_jPQCLA6;jAEtvNQV|)K&Mb-z+g=648F0j+{aRrUao-eK!3SG za?341LlF~pGE=%VQB{e-xqjFSaafM3ugi7WI$g>3ukNG~Fbh}1(^uvW@*T%%i#1|s zdEf(AqE<&wx=tGM;P%}9W4LWWF;mUOS`xZcY;ZenClkXl?eGa%Qrvt(_=YXPpA%N| zQK4_2=~BP};A5Osr_vbvEmZ~$Mj3qE=ehsQV!U8+fb^6_4<7AtdG{BJTQ2PxmgdJx z8Sj|v*BRG|JBR6;q;W{5Ew7quY%|1!gM)455Gi`kB}*zDH&7I`w$4oK>#kk5kmrt5 z*M(A{Uaip)6^DuUvTh03r%I5I7QZ(;HJ!1K-5R*JY&CPRp^%+DHNr@zSR!c-Vj(?h zUzZ=lOC5lyCcEgMyGn=)%l3s9XVyG4b<;~nyxJ<_R@lHR=q07E24O6LT){4HIh%(fsvdf)Q`^}){pz!Tm&+gD3;_O()eIZz zqXK5>b=Ls9Dr?(@;!$7g!egHzHdFpl9iOYBio>JyC?5$_NWITf&(DR*6#ci;mx^ji*4d{N8 z4hj6D${koFVm`5b^SYk&)pvpziZ+qJ!`n0OpX)e44{8@6+aK&>f<{f(f+B!^IicMS z=Zo7TW6|S&#(!2df9*2;rSjuO9qdo zgE)KJ<#|IMO`=$pxxAJr**I;(E+No)$P0vJVZ`k)=r=SzB&ALn{%)5EW=q$57b1Q? z1$NSPU9>D7$<>tfNmFu8J>#7!YROz(@=r1)l|wI8Wj?e$PCz@3w|1>NC$?+DJZ+g| z$66Ysm;r+ByaDkWWMr4qs<~UFTn}F7hfwP-YGAfT~NBZp07fPg$?^<6{-pi z^a>@n&n7|_8glRUU<*%ByEC?aro4J?m3Jwh>big?V0l%?Fk-IAT(7@ggSq;-8LtS` zy*W1Q(opq!xb%+J*J&y}tGxg{yvMqTVSjo=i`7g0%ca=}ik0KTo{ll{AA`H9RLg_! 
z9{2en2g!BbQ+;K;^z-(Lw2IDfIIEYKX%_{(R1Yi6sZsDX5@&}|jd2ny)KxfL+IFJ2 z31o&O3#~0B*$ug#i{-%D&!sS=a@M()x3DjFQGTY?ytOe#7rn*&B6;Tnl})MfAV3GP z%|k@+{ibailU1*?v`Lpy^BV$C@Wk~-GCDX+917XsGX~!0*0AyGDVr3Bo!S>teCC`g z@kK}O7IRhIQ5>aetXKXNc=|wBH>cQ?aQ!}{4Ypu?IRQ_P%5`JqlUZp4@Igt~t~)YU zVN}&cK-uIP4&R@YcJ`DX+s(a6(78+T;rC2dvUa9sTzpskuOcn==gNckerlH6LeRaE z4lj-zW@YErZej$}&)E(o0yOcqV61IgyEsne9UZHuJsQQF>#~sgQO}JK89!Vp?yG35 z*!3L%KycPCQB^@+&&u1qg783`_Nv?C6ojZ#Ge7a$ z>3xv9HioLsC_Nl0wO=Go3Gin06mxAuT4!H>u&q%jJ+3q8eLya)EaT;BpxzrWa7E}~ zCrURh;I;^dN~u}XIlMli0*m}ks5HJt41Kw-7~W4-;wz)_TyDALGKvh{B2mBCH`=3A zZ*}eVuFdig`LVs{o(ehq%4oqVGm#I))jux_@sJ3ya4J$Lqvr&F3#_Cj6a>&syqIP5 z!FqaBW7Jpl-;~@Iv1BU^J*k2)M1KYt3TTS-P_Wpz8B~XWzIO zE5+SfqEHSgEtV&A=;^btilI^D>16xBIn5#u7Ha;hGH$9B`d5o}iq5xUW1Zab+cTk$ zr(;{IQ+{|ve5M{*##Mu#er3Zt*By@Q5xpw^RZuzc+BsWf?Yf!Y@nDIPX+dC_2?!O; zwX!#5nHjgb2WoAmb*>Q&k<5+)*p*HIfO7*BdjW(aFYng`A6CnQkENxw%UCxf4Gc|79tVHQMgX zNsO`8cs{3lP?N69VWd*B_ZTTwLD2jli-^TOQ>%gY%@Rnah^Hd;3$D=7p=~CMKG?!+ zI}a+wO_43dwsd;E+EdIJjk;RX)_lh3u|2+ts~d;Rlj^%N_l8AZCi+4i%BH2c0?UB@ z7V&rXWiJBY>>l}g&t;HvMzPF$rhZBmH6Ohl5;YcxamC>@kHcf&oj&nhXnx*!n}K?z zrmh{PMQ@FC!T~3+lCPxsF4-#a{htn!WGwfW!ubxnK8@opqMxy+(1u+k_fO2Vea)N` zMa9NZoZUfq{}OVNCo7Ze8}Xp+GU{zpoDe`6A^ms7u9U>j&!;+S20V;&Y%0c=$qoCi z){BOiZu;K5DI_ImqD#vZclD-8qsqs}q``nm3|K|`-UGn@;hC4q3`rJ#yKBnonmF}t zC#CmMcxwn)5jSg_y<8~<&{PYe1>Y}R`m6Q+khi0wdQllxakaPSk>eRDrr!Yy_S?Q2 z6|cwWI3B#6@5Cyrl56;A5F&YRsTdm1+(^8I(`RY7$I}fj!4GH+eTw5C;BkT=zg8My z9E?w#H{sxWOZY8KSiwME?vmqskwh=%KVj`)WKd+(RZKO+7=y72~%nmGwQ4 zKNwq)N%tU*Guk1ldakQuPx4dTY2r$-FegRefnDtJ4j=F#zV3LlD*tH33Kr+Lemx$0)YCAQ;xHf zG>(UNiWb&lnjmG#%m!(#4~a=_Sx7J=VYOlKehzbcGI~3XYDUb@Fs~bsQ2fS&%pf0R z39O|Aju(JOD|alUc-@W_OzO=VCS5%^{XDfHF*LX`BKJ~Gps_*7J4zP z@G~qejm}!SZP_^gxi12nae(o~9~K(J1qoUf%a7BJm$R+HyYyZ4&FOSMP;Se&BjfhncdkG+ z8U$t%!_;9qyB3TuZ4=yP6yQigI~r!4r>hCF9=@6wfM+H8NDVP*1j6ez0A9SC4$4?O zlDDdBA9%s;FwVgF1qy77XP?hg+z!3yhYFM1fimn>YRM|IHnlnj;%)iz_VJYF(Tv#_ zC8h7zRu5Q7Uq8TmI&W3_%8)<#P)cJc`YnPm6M}XnGY%`~G(JZeLj)*r^czTzHn9o| zk6(=?O~T7n&h^nbd?>Mq$d>Pib>DU_yPu}qWVf9s`r8qR?bEN|+S=UQ{)%)y60JZM*IKOW&Ufb>#}JD;so}vi;ysMuZyoe!#6*(EUZ_ zw^Z+eY6+l8rm(<>feK8@|C^B^UW}`1RdZvBDMngHou1Nu&13 zw7>3S*z;%&kG_cZ??CnFixv+W;~~FO z(Kgw_ZTiBh8Nk5Zu{&8=N9d^l06Gy3ZAMM(Up?o(}In`blyeML z$xt{iXbHYu)sWUYqr+YKn0H3w=Ks8^eENNa${|B*L-t>l?+BKM!uK@ra30j9JptIL zOJVTb(kfu+F$0pUJyQ&g%+>LR@hts10mD^c-668sm!{r5Ag;kayBgQFaj?swscmq& zT82A0oA<6fD5>dWkODM4&8^o{G zdWl0ZN_6;F*xk3EB*L6`I*nd-M^iFEFF=HE1@si_6-V<()I>H)1<#91nxsZPNuu*U zn7^L%QvWew60f`2)8To?PKc;h{c2eq&kgjFXzGviHHtXycJbOgZ7Wud3#>*@q$2F)v7^5o1Xym9c{#}8^Co*#`4FT4=0A$n~=KwAkX%^64fp_wXAFqRBc5{XPc1O}x^rr!WyG??hQ630NnTQ#j=c-eT1?^X8@}Khz z(b7+LmrTueNVGB{Q{vsE%_@&mm&lB08dmUsZ!H1h35Vp2&}E#sizB=bcO*9sHVGw> zI|H$swf59TBy1`k@q|6as?deKp7)h3U?(ZHv!&vts!DXD0=&M8osAzbbFCI_Z=>K> zy8sn)Ay)gCl;GJ9KrqCfZT_pMZ7~M$e(<8tV$Y z<9JqQK?wDuL$lWTYWiqTC|+=wJH&)(JvZLjIU08JBbQzDeC9tC$@p4xuQ2ZT;ZCr? 
zC+J6kUU9Sm6{kPhlIA#>=1kqyuKR$I-#m~da@^I83(&lN@|(ePF_?F0d#fs0{Y=CY zsyNx1p6D--xCJI=2{DQS32F{a_YpqOs%G<0qBJUt4UxnOYl&c5oL27UN%@8#CBRv>z7xvZIb8&Sm8S$qD?{!%bA=3s=K7y1<6> zH~V$ZjaH9_vk4AC(MEJ+bO6@nd@)FCRr04MCQWb`Z)?XMnZnpzEvi5uTX7JnTvkNy z)y6uc@%r_b&|S`|N9j5TkjlDiBUisHm>X(mf}Y1!P;!h?jkV{6d^Fj%M^{?k;sGyj zo}^NGHzNM*5#y<2r=PK3RtyB&87x_9ig^Z@1Vs?-ikbZu?+ya9s=7Nqj;cZc%H{3V zULUBCFEi8>H?&|O=i|q^*a{^86^eI28RZQcb}WO`!x+_B9^*jcLmsdnXQs3y)>rcB1BK*`q#$KuIwBncxPnT9gU-Z* zMz@p46YSo*rC|r-K)O`IB~)c-oZjXP3GQi)M%k)J;dkd2r;H(aW>phZ7T5ZjewZ7q za;bM(%VyeLCH0J4N-yjU%Kr;(7l5p2S^5vIxBhY7mr$njQ<*01 z4FMa~Sr%?Q5)P84C9vhnqWi#kp181r2>D?s%+DRgdV;cGnwS>s##DvI$E< zBY)lT1yGQ6bDDmml0)WC|r%^T#`IWx&c9_t0sWsTD+jrXvGw@1{kC#8+SK0pZYBC5H&506QufdLx zJO$mFQWFO^Q-xf!vho!WBF2?y3YQaF1p7vVvD6+*9o&=ISkr~uu|pt?c_k=H zlr7+?CmqYUymPcZ=(6kH99p|ulmAXPhSpeT4nIfll{-)L4)knYxnkY0*owxt?wh|u zidSFCjXQI&b}GMnTrJc2kzW`Q_F-C^A+hxHIiLbdYVY5vT&LslGpX)Fx+P>W@>g{( z)GGU~b#s17B~{q}cN1!p`>ZIAX_I3(X#qKJFSXDNzMftYV0$6qTMIF1%K*u++k98uboK<#P>j&~EPW^#t73&(%O( znlz1Svh0Pe{;l|?lhoT$ofzmcD^=fO$ojvjYh#h7(oDcOYNrqF3hIrH&=AqnW439$ z;aLA3(A)Q8_>(a*TF<%(uExX61}EMpZt9Ym8)>vfrPov3d*66twI2`L~-7sQPs{*(y}29#PRq!J~EscP@=wQCbOS%XK3ahrU-DngwaoA2ql&kTcn< z3y*jAnK|{!Pc2u7-*n|sQ$yI$h84dDs022o!5W*AL1?JXk6I3dm2_-%>I}k!>tf|h zuFnq&(HgI#u_q8Mp2ChN$Qn_>CQzLY9(4ZfeQ~+H7n0pCyp^KtT)_&Xa#w|O+?#u9 zcoNEv6Lgy-bP$_gCp`m}AYQcYHNIox9vA<4%^XZpq%9{0rnyqN-H;7ReO*Y8_r=erPGtP+t~%|<)V3!u?9qJ67)JfZKP zLzFQo@qf};%lEJ{cUIK5dGrrZJ-$eMz`Vnticc_Uepm=?ad1OE#8BbP9#>U)@4rut zA1*fVX`XP?`&=3Y&Nb4c59o<1l-IPgY`k%H&E7sep|HO;lzSAf z{BhEzo(44yKl%Q%83Kuh!3Vt;~hwrgfb&l`FM!TVw z!1}{M(zt{59}-3};Bi3}5>oN!F*GNlDnfF$G z#fI}An*9*)oWIgF;v-@db4Q6VC$8Z*x;CJJc-47VXSUWwQMP&W$oy|LY_St?9v>Zd zsvFbBq)~#=_MLw21EXyC_d5K)Cy{T2yk@eWO+Sap{r(48`QKjvPSQSiUM`C?xoH3W z?Czb&H{f%mrdR)wmi~za0KXiSxmf>CX&irJ1HU{Gf&rgX-23!@Tf{H_MhWXq{Erpx z`~6>V-d`_~z~?ddLOFixB=c5C`pzO1lsf4=vB&k@nU$Nqu&O#+ekkbd)cg9l@%Qg^Ys zp1L~U8{eM~RPw&@j26?`VezUaEWi83JtTjAn5fnl@#F(S1sBC0uCzM2k(gGt4*}Ed zz0=mXG&!SavOtZ((_gOguPgE?Rv_yO=EN`~8!gbS9j~*!KmPfZpA)`6;b{5=WhAXC zmFw}3C?YP~rag(c-&`;F?=mokRF=pYs3M;hoR*(KCp8!0S264@@o`UGUu*&17g8^e z+sfNisz7u?(+Xi!yN`^Ov7 z{k!JzXM!M1tpjc3%RS|3cs+t#jw+L>RDX=LlI4=iR0 z4iwWERQM^sWrJ#^!#{X#riqL7STKy;`^h+fSF$pGLLq6g zKrxX=?cw%3L^hrw%6lL=213Zrx@yyKkzJtoRc7GlQG`wn6ogxI48Tm1W4hsd35ENU zh>9OPT3?N)=rb4PKLllo?m2$J1VUE_&;M`X!0orRU4tdOf?D{hk1#(H8%d!P-IhtFN)Z#YI<<# z2|kJ&AMY>KSPyoa^gn{3d{kO-q=F=~s9 zrB(T`llr-H(B%Pw%#+#Mi3%$TzRzjvIK0l56SrLBzNuc(&+=4z!EU<(R35C0hKaY= zBWeg&z$0iDN^Uh(Do!Hk#qd-zlzGvR&1~S=2?p|=Kzc&pt`X*7iLQv#Y_Sg~{3ZW0 z`W7fKsO;w^u>+~m#2(v}Nt2iNJjTKkluwfaAz0`nOax9}-!r2XDUi~K9xzW88~Rzz z%@2BQF50g4;09w;hp+U!mwh4`&VW6`r@tW2W^?N?G5``Z$`|FYz7=}96R^LIrKQ&^ z_|TWgsR;iNpN!jnEf&mM0z0@l$j@Athb^TB{BEbuQU@w`XKM)$@^ULmY~}^Zkc=MI z*%h38yBR71rY7C$xYh=ufv8d6Q@!Y%wnn3ZW2h2X5jl$TsvM8jlqr18L-&~OZq{=Dr%=tr}_WO)1t8Gbz= zt(WTZ+ZWjS?R;?*B${~4XG7L@ojb>m*w~V98ur+N1vtO>|RH^;> z5cx*Y`g8375~6pfOGsS&wlp(!=UdGr|6WvuY%vkpQaJZBOH2a@=r@Hd-$X9sKrMB1 znv8p|ul$+s6S8p7h0!d2m?>hCB{O_)Ur~qRq zQs<*-9Pl`a4?b)6f$Bj%J-I2DiU>= zTED@A%5v+Y+ilb4XyFc4dU1oI!_LZhB$K~N@(>?y@!j4+(Y>4sB~B3OB&>5{K6->j z%{ja=nEyQQiE(FW)cEGuNV&PHq&!oD6wK~?7~T{fNg}M&n_nLbQ%L?u2kC#l0ne5?8 z_bRaaI#WY4)L}F0i#+MUT70p;+br;k@WP8PK*VGqNsBIT9bGI5cxBgO;XZ9d5Bv41 z|GbHX3k*c;B{YRuWcK~0lHhmB*7cU#&auK(t{=6sW#VYa&-Ugy%Xrg?a||V@5Xo@a zb$)1}N=MG3U0A(-jrfg5K0K6}0>93TS?vO($+}}v^YD7Cc}{`3DdA%6yH6AnE37E9 z2O>|DPc|hYg&!a|y|C#cF+n2{3v0MmfzL%8$q+|T^-vVej3n$t=G^scM^)u^Jf($h zg?dG6UJNa^N_3U%9LD_a(#fDmVs8l{bKY8(ev{5XCDu;@c4?HQ6h-}6w}r|5CyAN( zi^}_5C5vhbEbAK)j}G0b*{N}XS z0>_pgmm0Al<9r=c#@CpZ06P_F;J34l`Z1(a2Zk!{5lf>}srtm=VhM 
ztjBFqsJwY}VXh@_`!@aN2nfELzl--h!HqQBJV<-Y=lY5QLmf$|I5XClT z<*)bjztGx8N%+l@d~Ox$Htc`GfIr$7N+opA!G;(2`_Zni<Q!-e|N z|22f}`jP~OeD_pjk?L5-TT<+%((yaU7;>)MCAi`ZtGW|gVPi5HRhyWT^76itG6MfJ{Y(*Kb(#ZXASt4izVO8 zj}*J~Pa6N)t3LbkWMb`!`(e>$lD}`VW<}7U`)nrf{`L3&_Y%Vrq)D)ZG8|(6|9{S< zWE7VS_HXI!$r9o$fpUgGyxtSI_s zseeT2s8~_tq>tdJ6}^ygf8ov=#3C8x_ieEM4733s8aw1FmQGCu&@)Uc22wc{F;v9= zyXoG|6CX2jp@TK8AKa(9bz61B-aP;4b@O$5 zu8jJFwy6X7b+h!sXdbrx<+1d}bY*3yDY={huO0~Ejx^2M-YypMtoC~UC{d_`9<2Nr zZq1$xC!W3@wW~0U2x>^Tjv$%68TFon1ECzdCoywcV`ETH)YF_$LeB9zJ?|SQ?=AbM zzs`iFGE4&rUG!_@M>GcZ6O;1TR0_}DVLlPtSMxd?rJbs>`}D;T*%QHlr7=F!L2n^Wy?uU++>*836|!P_TlkA{#_*g@$rc2FRno`5wo zK0@F#<`C12!mqu75ssu(XQ!uw=Hd&ou7SGk?3=5zL~+zyjE5$`=F_6ot`)v4M2WU& zZJwiu*X{vQq>$#5H&d%#x2`@oG)hBbJHWJ1rX$p*X8cCGi7itsI67MfIBjNuWFwl( z?g_0RU;q!k@xIrnfEI%MS9LtnR{;qsD zzjTg-eXDJ!eOQ71IZGO(-W(cW=CsiJfO*#AGLII!)~_uUPQ;yQw+wmU`5D9J_Vy|* zFJcg=+&|;Z^Ofo4sfY*T(#HWMAse^rVI~oeqf`hUq;OGFl6_r{TQ$e96<#G@U0(WT zg}@n9*AjB|{IIDNSf?kxXg>|cqQrVTyxObr%mkXq?NGDQm!NukbEc1mP7mB}pK}-u zgShF=u{3**(B|dPerdzuxV9GTg_V-a&Wlv5!oU;;kLv@;Z@J2%()oi8U>x?ezl+bb zm?(5(LXktWJlf-%nO^4kbv(R5Y$u$9yBoy&p|E6iI`wX9OC4ePIJ!&wydUoxpRC1(?*@)R-0rtgVd=WOP?d){I!8B zKop!vPgU=JraZj%2J<<{M8A;m2g^h8_}q|j>0Vw|dtN!03ATJ)5cj_^*oS#h2aPHk zfILI&F0?bwqHR!(L2CYr&3ILf}sz?3e7VRtb4@p0Rp(A&uaPHj=SCp58OcX8yEnf z8wYY|X+T8LZRu}+hnHp2tNyLR>Waze<6m9??!ubX1=2Ac(fsZlPZuUR05@RmyD41u zT>aJe{sw`?1Y?KOz7&4#*As>G=Z>7v6@Zub}J^$hrGJu3qi)0c5O ziAQ{v1cLusk*~4R*2DCv816PuA2RI8N}E)-n&W7&Bv+mCL-!-gEhhBZ;{|n0ZW*iG z%Y>@TL6vV|#2he4qQUZ&_{5!Wl=ek=)xo`%Krpd-04^n;3}io($IG!)THxT0EVuqx zP8dbNQ0RB|e`Phj=0a8QYB&Qdozim2;Dj+X&;-_T;DkJ;qK3yBGVx3%;|Hg=SF1WH z67fvB(f1!*CRnKx`En`(Xi(>*WPuM|pRh{g*U7!`4#Y_MTz|f;sA#a#rL2-IXWpI2 zS%mcdv(jSY46Bbht`Yk7<%vb9`G{$PfM;O0&GHQ1lQ3LsB~Z|FF1z@a)yuR94+rAl z7(z~KM3H<@jkeefCFWJAaoi|q>CIQEcVmyiARq)p64Q#>#;HiUSjda=$%!?S-na`a zn~+D0o8MEY6-g#a9Tvkl=WM^!qy||$!28d@0TB@b63X4%3mZrVFZS3i)VnA2Igco_ zVMm{|g)X(vBjtWf-3fX~%rW=uV5yxQ$#?i6YPy1yJ?eC=OX>xYpqK9fsxMw&n(%xA z>nJJDv3uF}ZH3d8x(25GDwkQBh=^%UvjakRxFIm%iKW*le2cu za{S-sxFC3ct?nQdy%L0Y{thrwtf5n}NksrqI_KE}w3CbP;Wb?cOWnJ3_4z4Ga4%Z} z!%{d9+!K=9k$n3=o$oak+NVa4D?10U^e3=&5}hr!29*M8iN5DK@D79XJbfwE5{g%# zko>(v8zBS36dcC5Y^+~9>6KOt2!yOQa~{GCsldC!;;lL%n;&jV-d&XR&xbIGv{E?F z1MsA|&B6QW@o!FM5bOC-=Td-kr$o*)({lbWsFGD>Pj&=)-CR>0?F^-VD)Q7D?E!d8 zmi6p+)7OUpC+LY|O!ROvIoV{Vn=EjCPtL*;O{5LHFB)}^&oBBp`%Px5O4g>(Y>2l? 
zmh}imf)s&;#9O`(>96&*BlY^r8w+6BYkbhgfQBvyM7 zp4*L1?GP!i)G2kl2mN2Mq7cSg3{E{h^wwaZ?L0LAl)k3>?jvMy8UnZr^)@gMLY$-) z;4t1`*a9$Fi1fld<^C;uibiXgfxGk0VbSBt3(w8V>>T8J zzYeiKU#1}WziKfW)HR^}b))(_&wRrPg4Vhy!~cf(e|*Is<}MFx7B~3!l>grf&_oCl zy}7R9q~E7XBKPm|z=bEp|BtA3Aq=c>HZL{*71#WAC+TnQ@<3b6Nw{A}!Cz_7K8ORu zI9VB}|9az>;);)w5C{`#st~tfet+{&5X5!}Q-R*bqE;Z13?6*aqmrYrlXESO)Aklq z6l{~mdH@XdDAWaXJI*_Bckl$?^0&9zMQG4`CH5WwQ&g<8uwd^B)( z$prW{oC#ipQ=|3w=M^^7rh2u`yRou(kdGiRpmQzQ(?dMC!;$gUUdkNC&dfy?4b1TKBbYkfuD_e`o10_Xdp z8`DLpVxjoTfP6tM-+0-hs-|zkU*i)0MZ4mhZW;M^%_H=K610w;)g?NXgJ_#y&t7lIYTT8(9fhn)FDTF{Ar|e{^KeTVRsQBtotv& z*HG?y?4O<`8~4HamfJrXaT#I1zd6)UKP!$A4tF0( zPxMFUP`aq@bk$#D?w1w^sdYOcLc}tiPLdi+y4%;+g(?i6P^WoZHea?yWc9{LgWY@u zIM1-ONPyTXQ^*H~74ZFnAG3cBB^A_DNa3Spd{G_=f`J}b-A0d#(wTa94b%P<#+<>V zC{PHI%~$`jl7Wm5Qmr_Ubo4GX`>nQ8w@7D*1yhJ#9+&EiCzFc?R$VTeFAjf_WHlQg z5|+Y&%>cWR>|C9zNzZ5=t7ivlcRVEZcy~*tNjR2B>h>J4B8erby!)t7j61L(Xm zsSTd1r^&n>?>b2ktW103031nKMw1eSC`=egX7MI$-FKc zzG~Da@KJdn4hY2br-YI%RvEnI5WHEzHV5U!sR)~2+Uc*xYXI0rea}}gDhbk?sU9eN ze3pkw#5MFLPrI@(Nj>867Only<_8;NWjW7j^!0!qlL?(vAU3Jmz8g3+%6US^$#77% zR>5J6r3KWApmrpueQ^l1wY82rbRZjzx^tDo&2YMHffI&4H`rcC7tMYV*O!pxb#uuo zQ|}&5!cW&8f*YgwdW_XbK^FI2F+Y$K7kU}wu)2jl@czk|;G_+TOj%XCG327~6NbGS zxn$tX(;Tg!Glf|mr%+pVP|~PQTN(iooQYU6FsGDL$&oSH6dv-`t+p3`y?HQ9B}Q@# z=d0uwh!)oLRRS&(#GqYfbTGnv8UMvyK(cp!hct1b6I|Y$br{2^urNd0(oh4H$m(Vs zCSF39(VluFDMvh}pGos7nyWup9LA!QqhTAR?|58if0`GM){I?-4WmUy2vfI1jzH(^4q^m)m{eZE(rLh7Zx8JWJ3>@s2E!xOVx^fUY()0=5P*y#%nFAU@p;6M(z-rneXUsc z+LAyF$KusU0t8{z+>yr~QAn6w&Mthv5M?v(L5azdKa#yusrk5OT)(KW6=^Vzyd(DpVgGsYH+> zYQb*#Il&Ewd}8pFm{;z7-$(kcZsssI%HjX*P``q7ATfY0#|MWJuyw4w6xNGl`PkX< z*nXC*vLCFUv>WWCXd(rX<@hI5@+l0!)IY}LMb67$EY{a6Qd;G<>12G!6(BB9E%JA3 zPa`=TB#ZCcH*A%=Nqq*3#HKBbffT9+Qr6d_x#Y%P=fWHoM#j)_1LD9_um(n)eu;4V z*`(d|{Ufz%iuY7*)JDX~$AqFz$z06|CS6Ol3F>HSjnN_6Bw1ehkWgEumHQ#2kd*G_ zGBNDOqCZ4l#j;aKcN}k4TFKlb^8u7kX2APdj_GMt2kALS*N?(t^ zR(^OMCwiS((zpcn6;I67T!W|fLn1DNQ$G!izO}4o)|L+#JWl%y!n7*oG$C@)3pzZa zV-_MR~PL*r%r^apiK;fhJvW6Lc8nSfw*cd7 zV6~f_Oi?44t)X8i5|`{uyza|^Ndg~*@C3J|#sI^2A>Cnfl(uZ3vRIU?D)qkW5iT=z z`VYyrqYrl=s$PXx$nd8pGE8mi&2@b`qD}N?-`Nvmhd= z)JqQv#0#sL>ErV7%%(hcRFsx*YCo5UeW#e79if_YA_ZhELTSRpIq5&8tWoVcDpqI0 zZ!nkwwZ6`v`95l?otbtl>O2{eB?I-Vv)y^zw>kf;==6-CaCf#{-6Yo2+C0j`!{%LCM~b&VCXP+ciXaC$-Ry0ey53 z$Zez7$G)r#vS>Z#`_LIakMn)FGvAWm!r7SUd!C3Q?-kdL~jbg5P&WRnBc8>KuvT&mUiymZAMg#X$J$UHDX zU8A9pdN2_DQCLK>PWe|~*T*W$%|~QF&LpyBF6hu`)b^GfoJpXn%=C?|vK{`HC77%V zfR#wr8I}>?_cPc~^`b<;vXP3VQKoP^{(%JoB?Da4l&>}e@9;o0u2uIc>JLWA%}M>I2)1o>oNdVH(bVpOg;Im{T!nRHn%}3rv62RR$J_ub@vSktYI*Z>5*fzbDZE{^| zLkcdOqYcKUj0Qoybap7g>W)Z&2;4@e;?%oTJcdXRE_|pT%zxz|Bc6o1F(T^=i}dm` zLa^t}{Rf$5MS6-;mt{NZISRzV_`~aG?*T-fuw@JC+h!KA?*E*^{#ti)eLKsz^A1ma zeBYnsJ#?M;B29-I6W5+euL=w_@%PVZReP9DUSF(E=$jm^eZe9(f~|Hr1Ch9cWP7qC zWaU{9Al}0IO;_5;R%}nLX1Mt@4gIWkSpR(Qy|hdm!(D)QHYSMp@cs#R8qa;JN(4|D z`Q!l<@b)k2O=lR>IBZGOtL_f155?#HHr56Nu~{nZAuvTrQJ*BbO-{V*D&KsUyfFio z;?7nMUBE1x5c1_x--En05Zz@De&U~qogNkpYNDGo(=$a-*IK+aYN+*=f!BHz92ZrK7q{>iIb*4qbP-Fw3x#7$4G5u zUQoj)MF2drc}kLE){K^!DVEJPP)bLOsnb36k55NO7Vu0xlH zpk-Bo=0OnAgc1dGf~>Lo#{Fz{(Ju%KqQ@6Z!dUZCeK5}LL_n1#!v{18uNWK=#cqJ@ z0f2QrQxBO67Jsj>FsoCs#8(#C-KB3^k=P(5s5EQ@0YT~R?i7$N zDJdy|O*bNqba!`ybV-NO4N^*Oy5U{SJTvpmy#E6}?1K;NU#@k>b)9!~Gw|J1epnH% zN>&$p5=hu9W+?oPl_B&^Fdj!~!Ps$8S$16Qm#+S)QXLj!XS>bk$B#BwtH%r8CQOti zvO7F)jGg<#)nT`N%NWSad8;Ou_@Q14y_3r0`dr-0Cl2Rz>8asr0LE7xzL@0_dlfM% zzXTuvSEw`@GT)kZgof#~dMrR#;qQe%l3QShL#Ii&-PFbJ;JyX-=|Jb;e4Du$_sBg^||b-4kj?E$gxo+g{QS=k%JEe*PfLmL4J`8<>)7{hCPT zNlCX9U4|W1x4HfVQug~vh8@yYCqdrjpy;uL|Iie~ zfnSx766R;mln5RDua+CVAGwntXK0jJc%(F4944?Ys?bmJgUqms(_-@V!$*+!AF@D3 
zR*UwpsyuWMho<;$go6BzLE!GS`u+Bg7Pk2WTkyW`C8p9nl;zUNmNYP%tp4Iw`s*>y%Ey^+6R@1MsfSlnTQ7aE zL5}%xz{&S>Z~hNMqwaoH92S$;EcybUewPPv*c;WhEdNldAD#Oiyx!n=DYY7Jg&I50 z#C9H<#1V>(z_jLrMuUwDRd4&b-O%1LLCgyI+_4P`-i*dbX&{DQj|d=W-$rH!8Te6x zus=2g4_g_IuKUgC%M4kDLan>$^po4sDpG-5Z)$7oK1DKJfhZ~k_8iIE_ssVp@l5uK17jIzaLXFo%wqx+N)>h@QY*PBuD!F!Bb zhR-v`Xq*Oc;mQF9zX!O_dCK$Na$N4t)pAL@JwF(DJ%8l;K3=-6=r%H3?}u-v%cuR1 zx9L4)lc~j^BJfkoR7~iF5(pLAf?qx`$O7ippZV=~gKcgrzq$`U_TktaeaCQ4&rs?P zeij1+V*~i^@}RdZkGq%3#GB#Rv4!?xi)mkU_IsXZ0LXFQ;IN+q1gm_^bU0pNa}Eqm*Di9} zU$B3(^i#BV@_i9?NDT{SWH;#6Gto>%!C#u*k~ET}*`}_A9BfFf76%(w`|^tFeR#19 z5{hruVCcGLe5JcW7lBrLfZ>dTD=9F+6v|loNQ7TGt24Zu^3d#=^+mzj$%Ha7?UBVl z`>`XwCMGh0Pbehh`*2<=u2oWn z5)FVX%eo_h5jSQ`)9kIWg_*-?`Hli&N!l^zc50w5Xu^C-5dl3g^<6Gkg1g zj-sf^mEj+`iw(?9{Vja9f0P@DC^S}t{1svgvpsZLL4J>+JBcrBECr83Y{CVdIu!PE z!x`nT!((~!s2=`9my=o>;g=wANX(Az{o{rGMJ&?lo8+xv3D$>qSc`nkV zygBTXAg6D$e`j{<)|cZ<2`@SUMSyI(^?oYvm$ej;)Q6;9pu_J+J3lIvK877TA;V_{ zA7JwEU&y%PDGAt^{{1s2m{K}k5&Okxu9en>lBrp{7fxEE<9;l0q6$moo&tEUAb^^p zZ|Y7Lyf#7E89csXL8C7Q z(fv)$r7`+CSbi$UPl%on3+KPOjrWuEE2o7GH0AoB<2>n%2!t)QxVK=28g;>ieK-`^ z<%Eg3?2Npfp5Q#i`oSy4>A{tPM*1aK;m;kx2eJ`Y%G|v|7S#*W-flyYQM8tuTGWSC zCv(+i6jmpytE=HTOHm1xJM4dZJ(i{FD|FehZGe!Uf0ge-fRi;1>q;DfAxN+b&tZAg zZ9a%QeJj7Ub~bM4<-L@VW&KSDlKgLmpAchKbi<;JO2J`J@*}L5$8g4iquhcS zi6QkYc3qGmZ>26vy-Z(j^pD~nW*{VRC;^Q}gZ~QFC)tc(forCh)|4c~&@^AR>q2x6 z&$YldhB*+PAH`thDP{;WyG=8)$i2Y{Or0Qprru*o-J9a=jffl^mA2+z{tqP~kZTH- zd!3;m#sN9C%Tr97r<6<{GGeJ3_9imnsm&o%s7+SP#%^N}u2t5I-H42(0X_^i7djb} zla`AfNW;itXIqWeJS$|Zi%bn3=&XZ#oI8bQnTj@YLDFCe^{AfQeBR;D==wBO38L3= z)Z@~nFHmi`dk|$T`x(O`o%AIvRv5pddKLR4G*IQO5>bN%+=tKQE!n`Hu^ekSc9$8D zs`Mv}K{y~69b^IVGkpt+iyC4M$3&!JFies9cK!*0%igT%N{cCail8idQCApE%nyQc zee7!K7<#KNm$id-=VdNFy|mqT^%a{IS9?2ADaL5K=~(ha1$PSiOVzaWT+M-&uEXE7 z|H|LJN5LHBFuCHDJ6%wjL?5833b4fn*5LeA9(gCKBsm0AlP)dqh4y zm-YM|{$fchzajEdBe67gt3-V-FM(O6?S9a3$VvV-)d}A|P6G~rmnT6&I%_3ff{!}M z<~}>P*9;3E77WY!e8Y#Ms~sRu${no=PD`byGGCp<>0V1G$$75$8(_m!qBJ`?k$nsP zaC17Vi8L_(Uj9yXr(L#pDOrwMbQM6WNLmp0Yh!t!#c=#H$?b0?=+8j?aJUV*kx?3 zH&SP|&Wb`g?>#64BE%G_ zf@GaMNa8YY>E+@T~9O0x={FKt0gaBP=%BI-Y@G!^FrVSm`M zP`CL`9!*&SLU$^ALqB|QDT~GAKSf_kK_z5TPWf$SSRVgosO-+V(1pC=v*jvKAzQY=a+b3X#Ch|&mD zN1Xu%z!LW#=O>Q{U9?Uyr6w;rTN@%|G^mr_t*M zd+(8f<6*Sf;b*GmH)^#Z*@`1XYlhx1A5%0${RtB@a@l1+{zh{6yyGCwvksl*mA}_a zbl-zKpI6&$%~CLx=t}soFR#Lf+i*}Y#T|(y;k8bsXatMt*Np@rQWW&>vvtka+jJu}tdoC~h=~y|v1t&5NN%#bI za5XKY=EI`)>FxquynHg~_rtGIlX)F}!g|3S!b_WqGNPR?)vMFhtTv}es<)dgQ&IDzb*z{2Ao$kAKcAq}2XXFTE3nO>5m;4AF z2FiulPP$PeV}CzMNoY1Iqh-GkajRP=qZUMWa`{D?I-icdP@hh@8dB7K7be0Fr!^mo zh<~1POq8cLqv~y5Yf5onxB-6g#28{P<9N?&rFg5t+KnC|*mGXTOg_{;;Zox3%SA9L zB!LBFUlzWG8rCVGwjnJ(bmo;%S`0_z5T$%naxU6dv1yLk%K;K*{g*&|YnbxuA`qOp zCa_m!ld@Rc{%hz(4PZqiNRrka{~em0XZK7^dg z{v%U7Dv)+|*|}@qx?Go@{lB%H8L1!;7ub~X8>Irxjsd(UpvZM3!`+cH|3*5RhDrk2 zg!I5fGW`qVE9*=glh0fQRqvb=X{WT3UQ>Iw;Q(RQel`e@y;*MxzfiR;81k-T&z{F^ zs0iH>t_ch27GCO&vnUH#&(-R9;Pc4;{NOY9n!5_n2Oq87*zHTHrkEAToROM5&kDFJ zDkF6iGhTM`HCnjDSi~yUN|oq%?}^nL`k#&-c0Vekfh|p|AcNFdd^zV+y$^x3i+)6( zh@`>}4vo(J7X((2ug<5~YbmCX!NvexLj}#HC0HL6b&ofN_S<-4PPXPm|IbzhYmItX zWE0MuDSPG~pGzV^erAy=NiHH@&$TBuEK?$!O6(JIVcp~Bj8%%e(-&^0ztEiD^1k?6 z77psxf`W+goa9j9e~KR!gqF*x@cHeI^C_j1d)G=5I6PZ`tyek^C5<70*(91J@oa0` zx+St(CN?p<3)om9`k&GDgLG@?2NWRn&19eGf^t!V$7=3yNBJ-%g=f&74c>nWyg*;g zs9K+5JEcXf_!->5<&+2Zi!js9H}@9@K8_exXj5g_BmXMv*Fd}PXVxbdXk2+?AFxQ+ z2mO@zVCRZB79Z(k?{tbxFr=bsBCjttsV=Nm_I)CxC|@bW(B|yjmF~>fXx;Z;wQ6rq z#&1@_{&UsgFTi4M<{L+P-FdivBa4cuv>BJ?>F7|Ru+&m&Ob-gf*AhS$?D@L>A3EoD z@dr}j2ojQ6% zmoAYA!D&!rqO#QGI86li9UXf*tEx@z2V!>&D<#?@+xnub!f-N|VJ>7Ok}pa7o12D5 
zGQUXfNc<}>NHAJ5@`V)8RD6o9KclotWTpCYkl8Jxd!CRhbN0;zp<7iU(@Hlgs}(LE zu!1QXN~l_CJo0LDI5kq4d&o^vYcRfo(k0}#-NnzBdY|Q!IlpK(!N})z>|J$#Wc5U7 zHyriOvK6_d)ag9LyQ`gZAwMC|FwF@j6Yx$JxErzOTPw0%?ROuV0Up<(O_$`j=7Vgh zsDi*W=~_#GDpy+84aZ>%(r&g}_doL|fwUq*r`7if>6Vu*tq5XMGBGFbWm+wufPFCJ zYjJ9yaqn)010M(U&}Y7lko&*=i%D8IFQv47zR?XG;Sf z^3NVE;WNFb+28i4ZO-@MwAGE_ziEjA1AUbFN=XTau}J6RA}LzpkOYPvS9}b`XPZQS zA>h^;J>TL!=P>_j%tY$QIpbz{{> z$=9ZX(Hf>nSBn;&zC~Cf0b$Wro>r0Mz9`6QQSFQJAFlaP)+x2BE$Q}St_0<`Ez=#$ zJ&B`rd~@hi{XsQyGC^`b{&b7u10@){84D0>h%#4}eu%tJ?sW)HOpDbG1*~+?IARe6 zq}q7y^+Gc?kMG}c#K+!XfN-B%iY;31s@T!H!nMY?zf#^Gc5kK)0IloV;zA>p&_SPa zp7F+@#v#zV6mSf`sTv7}S&Wu{Q7Ji%luhE6G8suUxb0<7u1#jW%FPMz>W+1^(rOC) zc;PC0{gv;eXXb6r&nu6PLG$rzl|oMyWvHSba_2ItJJ65>v05rBV3J(Vt#JzrB)Lt< z+CyK)Rv3!&os|j;N=LrP;q|zdEL1HckD?BBBEcPC=)mor@YYv{e^Qt}0^@oe7>UW( z5;>iJ)t7A_HHGmy^ow#8>FQ_92+y5yHIimYV>oU&hC z%8V_(O?hZ*jgM#Mi-b<(yoWfCMC>Z~*EPD=5?yb0mi+acks8w4xNB8@DzKbuk!!XL zl4iz;@hP;|Ia;xts}OPAUpx}#(~4ju=onJGS-*+Xa}%W&Jl~y4Gd?R#q3RkC{xjKJ zw>BK5hJEQ9Q7%t1*Kd2m>cfGG48ddVqS&?g6Cxqe+QKIn800xO`tiU?T6&dAPiDUU zV9>+2b(@;*-8qb04D>{sRgtA9i)U1K0$)=rBuB89B*}cumNFh)Udh#BF}*@8_Z#8peQ)$dt-Ab+R zgew1xguc!GU~4}NO8l&wmYt@c`%UG3h52>u)mIBGEaR2^*8Wj)eqCJO&%j=bZT7P% z_e*6~|6;Riw#L}^+&fMx^<3f=-U`Rf38`cXcPQzf3gYQjJl%ZL?SI_*s@Hs)Oz6RO<7O$J<+; zsVM2zOEO&){^6jC-db#KJY21QC2rYW>vcI>r6zDQEFSglO{=(r(A~z(aU=SbM}!+( z2eQvlG0QU^T9!AGE=y+Nb8UQh>$)#}i-2^pIA{`|#d6C> z)XaA_Cz6eqHx@q_R*-Bt+Oyg&V&egU({jvUSg8)4CraqNv?|P;Ya4^ zd>;PJ1K3$)oa+A3$;<}VSU<}Y6u#gIc)M}H;)w+uRT%ZXLkalaZj2;Z#U9j3qVNYl zzcCT@62vUp|h)!uP){v^}F(Wq}EIKOO8S4 zC>lp;|2IbKJxiAI{z#TMgOh$b+idj%9A?{V{69sT#9HB3ozqe#HlKygunehQnO+Ai zs#Aqr2`&7jOKeFWst4lkBx{qeOadXJ-a8NalDUFeZ~IN#!gh05&De!katrFjGP}O8 zZN6oGN)$}MlG`;J$HCGw>5(qresvu21^4?FwA{;Wtf^*E;ED0IpP9+?8X?-2P*OHw z=HE3WIgcdp_a3gz)VpuiezIKBL<-a5#+{>K8PM#(v^sGiW+j)&Cz3GL9mpmZIl!fi zzP)ituzVxTjW__CN9dd3N_7nTHK{HXcYPEO%jp`Ne`_13< z@Zl=xG1~Ys&XropJfq$usKC-Skbj&X=6NOsme~-PH}@p`+RKSdbm@KfB9o6)_@2(% zuCUm#>q%tKzCGbHW0%uCYu*`dCv$7RjWYonYf*i$bV#SIU=Z+ImWNF3ODb8_IV53< zhjcg*ohgo)3|Zj;zEX1@HaFTnel@BAVkTbG27*1W!cybz+$|2TuRcuPzWGIDQvq>cESLt@-}aE7 z<#L%+Hoc0h1Hpye#dR+slza}S{o`_ z2y*qqIm z3(7z45+3Qe(L|K2L`CNgrJZ?IL=4p~hi=N~78+CrHjZp|GAz;I259&h*DK z435s4fZe6mDPj~-8T|8KEDl@Mw~M$YJ=E5V)7<->cY88)gqd7xPrRw8n4aNodn##> z7Qj%+AE&;DM9tJ$mvL=yjUkxJY_`ADh2&pb zs2;o8?an%QOCrvd8>Ae_&wVqls{T0FzU|a|>d^Pg|7Vk!MS*%jq05&W8o-ifauVFS;SA6kl6QFL`a9JEd`kueXHoDwfyNYQI)z)Z3aUF>boD zVW{^^TS@Ul9DoSI`_kU z+>tmC&=QUCj34cPOIH5D5Ec}C$~o-#$b~(#H8s`MfR7_K>S5e{gBw~6w2`=)W$0I8 zAic?jv?;IGO#PI;b-PfhKi5L(wsrWPUtOuwnGPTw#@a^*be@CwOvdY56NirN2zFPK zR$Td{WMx}rspk{N9T$(eDO5EyYTx@equ*qw&?%|7@vN`iWQNc z|M{iNeDTa<%c>3B3~Wj0<6+NYbzmP8`K*oPtt}Z(lW{Szph5CT7>2`zvGN!IM_8s% z$mt=kg3}ifCK)B`9OaOZVD5rV^`HJ^j^FG&rV!4NOnB>Z`rEQ7VWbL()sac!U74>d zLsy}!kdCL8CtIqeK&hjYK`(1m=@pW5NcOr{tpTE0Lw)z z|9QHPhQuvNpu*lS3=?gj+8kS6kD^o4-R;4M0{{?!3u2!*+<2qkv7OwP#Z=$f#-p(4 zX7ZhKlh5WjAFGGrGgi_Kxvmgn)h3qENP;EdA~rnPtKRlq=Ls?vi9u6UIaY^a@}@Q6 z@#YYbR$gAT=UGnGAEB+3iDG9uiuXQ99T2@KMn?4>>qhRPL}QpyJ2tzC)n#?xTRD-* zO0#@j<7Z~@6z;sUVCAOvX8YfxOOAL)W}VImb5kH}ecYt%abxT)*z1cD-6%9)aOUg| zh#-}8+RR^r(Eqt9p!!Kz2M`ND<}9yOSC2`TYk>Q^Nd9Fyxa4EFsIL{m;Q8t1FCvKbohA zcXkiVxx^p}vB6d9PCQ5bWucqYQK=>3z=|!665sdI>?@@SExcxXlAr>`*CCJ*+i7$- zben@!Mbxudz+=hpVV9!N1LeR~tgAU-<-x(I5WK}H7Qi~#IR2vy63{zBc6MKd&2>>s z2znc}9Iv(!ycvp?gfR@xXRqL&nO@1yo>P22`F~?4ZMM;rMgW?qa zu(7O80mjyx@rXaKIrq^t*rvFGv$f6^sjRW=#smWe`NP`qk-H!bUNHZgV(PC8c)vtC z#pOblA1umS%CuhHRg`T)^#Z#3s;g2`lpF}g*n9KMG0xQu>{&>hpe7Cw`tDhBwx z;1W3|)#SdQBDyZGd=)7dByPKK3c@1oUSQzpNr&ra_3X{Nd5wi7V{6M4o(m^E?rp2L 
zW%*eBF_Xk>Vj-rF?P{y77Utd52*=Cl>T@TPEiDQ7$={OjxODl+uV~7qvJN-QycU^u zzi}W_>izhs;%t-0X!{rLZWun>a3G7*af1&rqkL!cgYeiT&#Ue=MW6KAPZyr#{P{%j z+$gGwaL2TCIegwWOHL|Y>=HSA4sr2Ne5!Rim8ZO}^7)M1PJ6E#XHJ}dt)XJL_pfR8 zAT+5%>t|iZroIanw^gB%e*lE;w4MW~_AlpK`7d8f=d})01%ont1YSem4SOJOeB<2a ziF)#tZvEdJ2QS{&8uwl3Th5e7S5 z-tzAXLpxwP=5kvLbY7U?W@2jNd%QkfiuC#!|DeSd{NI zH_u4&(>OhZmk$1N!Hz@!0#cm9r(yggN-{GQFQr!Y#Ov zan6me=j0dIJbSiHM#J2&v14NCFGh%ZYVoE=mOiCJ%1GCAybKS=(?WnGfdc%}492e;HKA)ncJ_!k{9 z;>og9mFt;2MH>|H(Q5=lz*J(HLhX~c$`7y?pkxV<=#ICI8fq|rVvPiqF1REyG9?9f z?D=N*!Tp3GLlOHD`I|o}jvn)Mh>V|!fBUp;Y=t^QZNEV`{u_RVviu__h{P~n7qj11I+;l88LGd}8S+;)3W6gSxlbXkE?vD98{Fgt z%yzSFooi-IIT4(MQl_@I^`|L~r;KL^mjU(x)JT3rYcF1%-F_b4Bf=t8IP)u63oTg) zQvd;h7M8A{{l?7;?Cwc(k1Su85pq-17P@+C*V8TQOVAi{zr;LVePcY=zDe9HU`HWk zAKgn87j91!i??-#XyA;k)|M;U_m=9F9;5?|zHdx)PyILpyReZqZ;q2tmEA~te0!@MwoUAb zVY}q>cJt>2;@J{YH-_sT<_~r7cpGi+>h%<=ai>kQu9zDB>7fIi8&Qwj&&yBlTP#4c z3(M~_cp|>c3V9_FMm%AY#${3t+h24a>aNf$)om=ikgqMux9{=e-Cb3d@hclO+ll@r$BKgT^@T$GZ;ELsFEVOm$(wO+Djd={y0eVxj}wE> ziNaC9#;naGIsJtf|6oEUU+1&@PhEe^WFle8=eM^-7G$pn$Mh})N)Nsy*djkt9~?BI zPcPHtr`sU5v7NBF8BP>qr`*aQuO<^@TJ+#QqIT_wM@MuPQmTCH^G$<@bI4bCxN zde3CJoRI1x*W{xdm>%aFn555$nre1fD%;0!p{G8(Q{Kmo&vR|doA2@MQ8|3Y7~IB4Ul$K)b`XX!H%;(VzNu-v8Y<-O!~U=d}cZ2g{2vay5JWJDE0mW3du;|+&s zHBOo$zkGdlM$+MEF%V17d=Z(f+%;zfNtg$*@skA`%lRZ274OfhADa62L2Ek6+Df~V zeqZIphW3Gus0Twb+2Oiu5`Pad=B{i4Z~N;`-@Xg?0Q?{~W{mNq5;uS8fM=pppthKB zxo54=l3V(o80#)MKy*%OT%=fLN_WH6-*x@-4~oGvEYu0#wY^ZCf3X1UcFUlVDkgU?KWk5qOElm-96`@~QFG zeOz+OJVYf>a*t(9!~Z{GS2vFqExV8-J$ZjtJJK!-?s~``gxDcMRr$( zo+tN+coaYyRg$Q zLitA8^wCGFzP+6Zdd5mqN8&4Kx8kFh(e=@DR^i%ZX9#Ln|Cpqf9@%OU`Q!P?q8Yw+ z<7Tq$#~x0X{^iYN%B*UA(moRfHQt^}CU@kc6j&ix9fC?D#p$TF zk43F;Upj?eH&T!43RH5^;(SQIm41A89#GOJ3r&=suAiiQ2?vC zcuJdF9B<8f3HpeI^%P5e_~%yx+wQjluS&=h@y6}3-eaaJ3@0RCg9RHCEZ9y@#v42p z!&c|R2kuMzf(RqwAei0kiG2gagnotyb*z5S+>K0U&4<+WKLqQA(mpVx3MN*UnZ5rr z_w$=EizkwQ+wpOB0Q*qtvwF%tZ!?D1lkGe2AyS#ZlS2*98fFbJT8=4FVOf-gpopB~ z|I5K42-FWvSYSg)O>7$=WM4-Nz%DdB=&-FlVI9D>+AA{cJ%R(k+pP`Re;T3S?g0-8 zVOjsc6m@`Hv)fd%sqDQi0wTzoVS^)0kKFj*c&n9YuYE4E&OcIz)&JGp|ChT$5?Xwp zUcM_MzLL8?_5b~azXjN=)%wABdUh~h#6uS-fbGZ8TEt}SwZ45a`$8t*5kdIoZFV5g z=5+4(7AP^Pnn#GZPSi*q2&7|ef>HtE+pV;{BF zR+BxG`ae)e_dcF1jh<(7%@%Q%OSMBIj}ZpO)bAkxfCo$Jl)~;%2x_6SCd2XMz&P?1 zcuOvyaPscu``3h(3ngL#_Or+ep>Pw=f>kBXk;EH)+VnzNm?(cPcCJJa47O@pAfIuNp()LFD3yv7H z7@d@NdIS@j9V~9t_lV!AvAj4;67F6t>*k!X_TfWij&vW;dKdsIaoHNN;OBj0VJ(+a zpS@-smKJL))K(k5@+gpAi1_D&e#J zuRy0i5L2YK4y?^41l#xBVsBdhfB4LcpcPIdqyQ%qd{F2u@#UG$va$Vf?-LF9n1bEL zv6;8z!?JKgiE!?)7hfrD-&eJm>}oaur!_F8yp?XbDjhFY9msRLx4~?tA*Bj(l7`JG z_>(HU%MeDo2e|&BYcgD(vD@3x0Jev^DSj9?UUlfvSKbT9HGo=NLnq5}@)d`T)6&Rfk13GOVtih7nO7nFcG7|-UGcvAE6#qCVL#sO zZ6okL{b0P5MxwHCFrSKTov4CXbyA>_zE#ijXF!O2aqt7LrIhC0iuJD-tO<8{$pds& zWv0p7p2$K=RX62RvUQtXDnHFI14&#ks4A~TUtcUnQ0!?GT3@#RKD)OHt z$sqW_-RWbkso>@lS!m_v`&e$&@e(aH`6B8ezzUE%a0gZIlbu)3`s3nicV|(X^Z-Xv zzt#Zm`5SIvw%F9r*NR|GI&MoSiUAvtRZ~LARLBW>W}LSXa8WOg0XoDijDb8eY_FJ_ zJoL5dEF-xcwj|XU8cd*~(Nnkg_2)!(EA!KXanKw@;^FhSRzJj}S>`S^lU%x2$m_6X zPRI@!f=OdU1*%oKO<9>J!rA)oK5-SJBUIvUDf!t4G?)Iuh)4Who_6xqWi^z;Z8wfh zi~Ry1QHM@wl`6BRHrfmt0^-RPhS{6N$$nqB=QF^N)>W8>>^{dmvqSySf%g#N45##M zlm7d92{aLK4l-(b)YhppV{R6gMZ{m`omv$B0chlD=(^Y+^$Aosz1(wkxfo&1pSBkl zUZWg40;B$`f?vY4DR0c-5#-`tK+E7~!2z37fB4Y9h^;KNZmp*@1)&0av|;|6!@!j7 z4@#4jEkL<@l7oGF+)q~)Kbpf{qBUTEfm_vqvGuEIxk35&Ze7JznMMX$Ojr>BJ85$c z-^hP}#n}rCa{7NhB58ik&O4w??U=8Q{K!<@5~ILHr?Vb=Z+$K7TA~^*1vQWUfS|zi zx78>@@SM`C^#q(+C}PJWAWrbE4}yfx0&*Lr+QXiI8r!eVI9$)%UTi>#^8nknMN>3C zvXPt`-K54xh(^#O&WaWXsu0Qsp^_iS%n+`#UUGcytbO{4*HlAr^H+hAA7%yy>=1eY 
z&sWNl{f*OaZV`s>#FO3d9>A4ole^dNaK1NboD%-XuTW?~Q>TT$EE4 zAYv_}yf5dA8XOsU;!@B^u&%Bzj!?|?-k~KSW{QP=)Uz) zni{$gV7Tmx5xtE?Ae25{kJFQONg*+xtu~`7U_eA(>*$MN$vakjxLHs2gM{BQp6-BN zxa4qifO>{Cy2s43zy^$iBGhPcYtRp%^OJARL5&E{#|y5Rt}q<<+8r!M`kWGH7{I8C z$2C%~Szuq%6(su~lcR1dg6FndYYsx^Q!}p353%ac?ue<$)pIw-(@CpgYkG`;f%Dku zCw49vJ)c-!2_`3M|1!VDzvF#B3hBtSBqgujr$=bOTr)6;M>B*Q3FFV!k?;_fkz-Bp zJ{Jn{odORnzW{e>MNj{0bPhzgh3xSMrjWqS1W3n_@%k5_D*5MclOM1v6N?gETSWpy@2 ztxK=oGeCIxkO&YT-Zt17M?I}K$P)KyV%>9xJJL%IZ(EMlhAOP>Ug(&b`k)M(%W+!{ zYmH!JD7|5`WfCnapmkK|Nv~oNiWX+x1aS55PqTJ0h}e52$UJv6ok+0;97zm545%`S zkdjA}M-P0opA-?<+m07Pd#7Pq+fl4~#42Rh>Tx5RVD*S@*tjw5s4GN$fwfFA@`YGP zV6~p)^s7o96zpiX=xB_$_O3H=Z zdKP~zDrZv;J3>elM);&7$;MZt)1q78@rTAdcc4k$$eV7)Qn06&>_uOpr3r9P4Il`! z-H~EmNhjXd*b>g}%X%?{lY&f|p%ov?D;|P>*o~*hD^oT9f0@KnzkvANwByXpQG7V| zhwkR^^sACPp#Lwt(ZBu^)LRkYaOG%PIR74H{rg1#9~7Hb`2PQh z`yYSFh`%Q+x!F#${e21f&*yYlr|bWJH2?UEFcVlUGtPgN|3eS=fBgT0fU|$SjekEH z@Y(NP*DQR{Z7}t}=lR!n{leJ)_v!uj2S_)#ZhpUlJ4^q^H2<%!0072E|C?O=^Wd<9 z?hWYOc%OCt>xuu*{}1A5LixWB2>KK-e{q8tFdP3{)cfx*dTI^B`qvTsk9?H|Fkq;j z>JdBsD;@jmE0*teQc&Wbis1$Xqf0-HfJ!0uL0(8POEN}?B2q$VgeXy8Kw5(|6tN%n z1sYLOzm!52vcr@HKS9uova$wU(=*5Y)8+cP=Bq8|Q^(xAyvEIx2S-+_Y=3<}xa(~H z$6dJ(a~|G5Fb~p*G5_a~{&lBm7CB430V72kV8uyZIz3m4eJK;GxSooX9|hEnBfnfV z6=Gi}s1&O7l{w;4Ns-b79i&;9@WVyL&e6o6N)R^L^+ z)V$WK9aOS2we3D!uQXrpzyE^!c<9|5Etls@CoVvtw+7D;VRq<$%H?>RuTrW#W2#jG zY~NAT^1T2%V04r#SLwOL|4K34YvfX?!r;8z;9ff@A*Qwizz6?2nSXzH{~A27Vafp4 zroCG8mD|Bw?Nuz;U}-@(X&ZB)8P92(S$r(Ac2Lu5deKAXP6xpMl7%Fvo@fa)o#D|A44=NaMuMFxgSLf}J-wwW#0TXa>E zV7#_kfzIB|dR%VtRpWlY_s!LMa?q=Ll$Kf&tL;Pym^eHzhm;|+?Qx@ePc*ofs*y5b zYv`W`lp`n*Sn5@Oh=lU?kAd`*9E%t3rQPT_$G@{kb#SJQv7|ofs5HLmrdr zEPg-B!~MAaek@1ub9&aBjHL!m6ol{^V6*B4A^wmp37?%Pn5q7(Q3{ymUJ`LwMpT&& z_kIz$1Dchq##)1inLjgpQHEOJ=gwddr}iA#P^wNCF~4Q4koMv8*p0%$|C7w-tq5LH z&Isankt+(%sC7%Vrc{HBqYbq3KuQ7>UUuSQAsHf{`*-K+a`aj~0E5y&nM_y)Fzw|$ z2>mpK+7dX;z5+FK$6Drv$2g6$^@fOF7txv1vxKpzN0fC$8yUNuJWmr>kB0ucVbn- z4&J{~EsVY{&U}JCu!KoWb5D){8ui{Z+C6;|12r&JEB)SKK?(+>{+X%Do2j$S5(n}d zwJL_LVs>tN7$o%Ps&62x;~>7!Ep)omY^)zlFMI(;8%Gd_^HHsS4`W-u7D4zSw|st( z_3Pqdw~p^;(G;E%KPxXCV=S14;@qmIw11zD@J!B()X7LKNy=pT3SFtyDL+5Z?F}Zfb;e8$PbL7Yt-V^+&Tb1Smz|%P{Zh-4 zww2#(vZ+!sUsuA4r?oqgM+?CbPx(R^R2=N&$V<2K;`!a641M$DoP-7Ly z3|S^khLQ&Gnq8#qZ}>A)xpN$@8DTVRmeY#z`412~YcIDZ9=XbPC;cAoG(_$Ypu`q_ za5nexMBVqLOhDIKLXJ{afZf~X^~=*6?(BEt?%iKjwG_hf_FJv$6|f5Hh(s48NS#HY zx34v;6;s^~U#+_1IZ&7=faOvi0r=@{OKPp=#5LjjhUa!;kshp4kKS4EzEI#h ziN3wlPk8#qW%KTNtuAZ{EK<=RQ%LlzBVzqPqgJYoYK`~t-!L7>dxX1Y48#K=(T&k7 zK-asaKx80tgMam7R@Xey_r?pY0gpoeM~oC7i7#XQg%RF2fMa2GKZ-k*F-O(NyBUm; z|DJ`U`vxRk?-)Cn{bB&_yC45VMI*;YO6TN(PYM`Ud9qXnkg0Mfoz4+BvL>psZ&%ssQ z%JpL-sKM}a^#8fhcxSf;18f%lEC!#0iN|n(_9=U^d-wFb9>RGYt60!ep9oWRq?DT< zNnm{U#C$7kBbr|4`}pTNS}@xO;GU^~?MguTW;I-ZiEE$t3}ZiGO(Sjl-#f3j9CW8H zvW@i0onT?f^UliSOPNMiz;cyDH-X>Fc6bVd9)F!j)zv7t(i%yxe)xXI!Ddw zA|Iam1&0!GBxZtlU68qu(+BG-y6J@8@Ji165|a{5@dw^&3JxSFO&>v`W`4L7g3r|P zj%6uyd<=;T%(vj;MFnX;^pT2go?T_}oSf5{!H z`@3e>+TWhc2F@q-yRZt3Reyt40C`9>z7`Vje~jl?V(*RU^W{O81gJYo>2!W)m;)$- zcOzAug#6BmfU>+9n`PeSb?bsf!Zg+I3Y34lGrB{8qKYdAcdt1e7u3#id= zef?jxj->FI9{vDSXPwj^ejWa-M&>y#N5wi--%CQ5SoJV&aVHz>O(|x1(;H0n0R^3v z%RD(o?SlsiC4x965o|{=-YoM=^a_FBZE*(KTBp$YRO7<8zE?xGk4EBuBxWJs^$}V<-_MULxfLW@DBX(cAQm8Uy17QDZ{Il4=Tyt$JJA|;S5#&5J;-RS)g zW(9)9d8CKeNti_JmVN^=Q)t{RiN4fyP1*)PpLVD^q#CB!2e9=CcaPJ{osXt;=E|`r zSx`t_Jnu>b#`EJ8#BKyaYD|U_6TUpnmB-kfu-JzGdlC)w616M*mK<>r^1TrM$hdds zt)a?v`Q*kB$4jt~y#*yZ79w2HP=J<<&+e?n2ckPs zOurZH*x!C!*Gvso-W(NB(TPuw-?4g^|3)#uDJx`r6lwJR_|M_-n8E;m$_ZLP?RLv` zC6t`GwsX`3U;xzvD_Tp|9eLr5G-@rpg^3B}cH1Fu;fED0V&qTFzn%K7R`mFkzrjZ& 
zgJHJoj@jlZJ$zjCQHWFcW`zH4zUzqM6FQki>UM4N5+$|{A~fP%z9PTM83^83jq4RA z^UJ^4t<&PZMLPI&ce})<@<%?O(oVGvz(h;?2A3iMfRkwD7%T@d0Cyw>vfs51Fui%( z^~oxmTBQ-{GtoTBxmxRfj-_LctHmKQN(sFte1x%-WDfY#j%E%i-0XXYVrHr!z}uV}+p=@!wpZgJKM?v`_7;ohELlUYFS>`SSe1#pmS8plfKsdD!$OZZbOg`v) zS+_SR=K0E6QOP_YfZDjNLxC?iWqrECQysKQXD}Zb085)gL&g(x07fz|Tp?YmFicwL zc3Vviz(!O$`H0@y^8cv&3a_ZQwrx^|9zZ08p%qZNK~h2qkrt#=8l+1alm=2#!| zyF=*)=^k48+jD%5=bYy`?;r539fPxP(I(O0BwtiZ?F4WAh}zZRX5iKE%h)=eK$J89sUG+xRd zlw69N9jvM{Tae|Ae)oH2WF+QXci|dSbEqk&WOw;-w|wpUk}bUdV2!6o;Oh7G4C5}? z2e>)fA!u1t=G(M*_(MY~wxE^>>~2_s`%5OH;ZV-&i5bH>IZfQzKY!@-^HfATi(nL& z9J!1vabsiPpwoOQ*X{g>$zR9dO5akEFqtr=c%iu7vS@!GD-~pzQwfPWy2n9bJ6V-HQd* zw&F9eDr!g`Ii_%#d?m60pvCq?A1JgzMQ9jL0$k;l{bGa`M?BK_?JL6tbjxk{=ywWr z1(e9+b?%TUm3;&Z;DzfzRhdgFJ=C51l$O5u;=wQf85 zZLiv9K~aCtr^gpWmSt~)%sGYw2348%p3~pOL#*_ROo7;DuD809pH01y&fMD=utau) zx}odL4-&LVY{iX(pzd>1MC#i?bi)NY%{A$Kc2gZT6@}^p-a_tNy6|N(Qq{z-#4fbr zm!U+O(Cx6r=`?;@PMUk}ozk~VW9e1CO5L>41!m)%^+ANxf(?rwB`k+NMBjVHA5C*% zuzg2&*0%zL5rEp#*xW~&_;FfAlZQ?DHpqTxv+xgYv?mT@L_-w#gNpJAC9ivHp+?U( zhGYe(yzG`@OTWWcM9k_X6XKOHmbxdFuFDzod1piM! z$Nx+b9Plv(XBwzRA$n*~c_0`!x(6ckh=Z=^KB5Fb3cQX*I-6n*t|^4@XE`n~ z2MpW}yH{K{(b6+fZrb@cN#b#x_x**)Ks_}81M)ZFn}2#X@Yd+oEn1(0QqXX0S#0dg z6ssn(i;2eykMugezKh{Ha+c?&VP9J=gMFP~NWEBxAPBTJ-gcbMQYk<_-c{Zco>Km*2;Q z!U$zC%TvEn!n)pwnx*n)W^;_tA zgb#(NXVb)X6)mQt3cVh<1Yr(8X3@0wpPvYWDQ^k%60Xmzi=N z{x^(2j1>*C`yf-a3Z|EL|5c(J-=v0mt_f&xDL zmq$VXNSH{Q6E>i}^A}__Md}YCPCRP z_}AV(nA{%#z$xK}xyA31_m2lqE`SHpx4kO+7vvMbI4}@mRulesYV%JQq2G|suLtzV zIG`5yQvSaV0gi_m~?M})tdQ%Bb?{Y zlkxXw!6hUI?@qMPI`NMq`VG86KLIF2=C+;d{{b8lF+jzQhjsp_-Hd7cLE+ySnKrrv zl5w2QjtZ0AUWn2YL%7i#f3&|*uOmK=^1(y7Vh3j$=fdWn* zy$k@?(6!nkD3|~lAu3Mf6^n9u4A@SFutpaJ0I&8A(Ci0uG#e3(6f!cf5sNkfG_!e{ z&YZA_!@@C&p4PJp*iD6nhqjr}`@dyEAv$cs_qLMU+u@U^p6kTXD|=e^A58Cb*lv9&mkHF! 
zMPGs6S<>OgNYzM{1X^iI!?5+$kXz}_< zGTD%)A0Z2-F9KFD?IZ$;m%@&IEhBKgdt57v4|f5kfl*Js)eAcQkkJ9e4+HcRalbmo zhpFmL8@mlzDrwB2gtSN22AcH#$%-iTF|#!uJj}H8(SW6dhdPyCX@NeB9@-yI?wq?T zVT|f8;g{JGpw=J_+)OGTq8-#YzL>1EiU+1!`giqeEcMq*BKprg_DwT9^Ql$8nXJVz znMdC<@L^)%O9x?CaJ1-*0v-6*F1M|z+mjWxJd;N|lPoHvw~fo^&l)Z-vb=XP#n4JS zCSH|UciWMM56sZEM|WR&9&o0yYUaRLcw4Lh|HbKa==N>xlD9(zI%>d55#~DCfQsI8 z&jNVI;|tHE3puCya2+ARCb0A5v3+|55IlvgCuKGj-!G=!rrQpl#WNS#wnyLOxp;r2 z#-h^o({Vu$>2u$x2v#A(9Cgk&Qc+?zh+kk4Y)?z@+*3i@if|BRgx>R)@W~7kcS17f z!g|5o`_u6Vbeq|bd8cBmmEOP=5bfP3;4ARib?eF6Xz?4vvxM7fiVt%KKXMs8=#b-i zzWPWZPoHBjTSZ{9Br08a2e`L`q6H|c+vI_>JxG}%<~DADC?s>f^FE$-10-tIwW%sN zGH5CeECp&T1~WC>vKzE&l1u(rG@MJWy3HZ@X24b!Ejk_eX?6q0dM4n7$^8KM@NcJg zj`HZVyPZdiV=Q>Bz6sb~OMn!f#$kASklm_iB=U$n@G4 zvI4f0<81&W>cNOfPX>b%>WzW6JA>cCg*ltD6s7=M$41OVdMa2rwv7@y+Iy zbO2b((=wI(xl#o&7a##=0_KSE?5wHU&BYE(4-$Fnp&(;wRu)O=2~XidN9zQULyK0^ zJK-#uoJ|kf32sJYtIV^!NvH?3UtaX+WU!8P@w{eDIgGNeWr}A}DSuk&6HP7jct(@x z8o%Rr#uO2Ys3DKt`xLzHB|`%P#Qe)2GFY(vf4pzke#AmTi>TlK(ve<<-&G+gt1GaX|) zRRVTTW&6~yk^#J+04C8-C9c9FgKrIyrWCb|p;jYIp;c%1$5I65)|=)$xCt&>Ar%C8 z`s|3T#)RyB-%|xB|6z&eX2V{dRxj;3sYx(Zg3r;(irvOTsiv}$-oZIX6I(!bk>zDo zD=)aScaou5>i|NQ=O>~l%3L=pq0h{=j%WQUUOA~Ycg8U)Z$4Q;!v-`VWh|Nki>QUD zt^Zt^)}xJ}b``Ws=v?gvA=3!xE%Df*qfM*zk?%5X9d78wv{c4r z_p?dmqNo@JWPNXDUwovIkbiN$)!?5so5Eub$+{6L`0r7E5InRVkA-#*iA+lG0-w{h zY;NzL4&&K$_ixEPHVotBv=Gy!ot1wyRsK|Dtr$C+i;C3d>q{ntZ%Wpn+luG8;~KE_ zRm>0`w$UvxdMr;LvDm@c?`o@kquhvfpM$ptC&ZW1^1aE1*r0*Mdr#H<*|*Un_BHb( z1frpjbD>E7Pdqh6%5iM}wJN6qp|eJE(RQPa-QQTm-@7n#m(Z|yZjHPv^>D~h>^&b( zcsn*e)=S$E2NuQ7$QYKzm*HV%-v&O$3m=wOTmbo(T^z(~a(Nsp=q4juUv3jPh36(g zU1=flQXhh6oenR4;&bf^(e{zrdgZ)V1IC znVUMcH&LP!02^v%cm^=&cdKTnac(82QqvkND~4?9V6{6{TzHnZ{;ov-UL5#$Fg2tL z#CG#M(4G9^0gpJn^lE&*_!|fK$4iqzTTBW1Mk~n1WqqjA`lkkz_C}~O8tVPGAXh97 ztlTds2W#<|+fAZ_ALwt}aDZS70JBqC5GB2q0F4t&qr!QLCM%+sl$mBd$=`}Ma7Jm? 
zzZ*0AgH%RHQ!3KE$)Qb*SKP$rz@p;AW5HolS}jrgTsEF7$9^lyCk!fVE}A`iBw zbpdM&2`Xa`U&D#fzIidXpB9by7Rw-;ymi^rwo(?4ayuxdOxl_A=1{sJvfC7&x&XE| z&+I+~YV+t@tcmQdy!pI6YJen8el|XF>WrZa)DEl(k_EbTNS_MU%3k?p=)KPcD_FE04>@WXcQz^pZB9r)APcz5ZA7h+ z(_?{gV(b_1>^|4f=xp1uN>xN@NK!`-HpzrW#SU;2O-V}Wf4(ecuP+Rx;q0CU4IQ(T zm*gWf-^aK5GQ{n*0)Kb`yPZ8mGa=RXnMOm3m~~>rh{N!6I_xt?sJLZ@1!xd1_hnee zb!ZhK>5GPk;`IU33|Gqkg{rKlk2OFdP8ly=pnedtc5*2wPAK$AicybY; zRsS%`pNUnlAYe7DkAuAZ=U7nBEvPselmnB|qHko=wJ`f#o;z)haart&ag2hbM_{IM z$3j$J+vWs#OJKg~C;{&1XLy~W`I%xr)xR0M6uVR8Nva1@Bt5ncN6kQ)ZI@=fj^_at zzH3mosYx@<<}uuCndEe6K31-WnFBnil6%Fxj(E}VYvLU4B0fLOeatjoZkxty_q}Dv z-|+C*z)wNzg0Bv=7P&QRoeV*zBU1r%#r1^9CN=W4aD(cQHKoh&0&0kvo^_c9%49)R zuc7$mK|J;MB)Kp1N;RdUFILKoTwTw{L3K(2{}A>{RsOBw+V?gdlh&@+d*J4-^qhfP zL^mujReQRBR4YY%^t2bkmqOd_EMcz^p0};A<#_sa=x)gAQAOk<5n&wq5BR(Sz*|@@ z|LTsc<{5P82di69j`X8L^A(MVHfI#`%wE=8VrQFh*&%`*{V!W$dmRjqHEK^_HqwYc za~)oAfK}Q1Kbc}dP|KmpQe+BH&GSaw_aR@`O+xtHjX=rRHYC;2Q%uRdA5~XU{4I&Q zMgmrIZHbgsllQ8$eUHq*+emL?_tiFNwwtAeEDq&ssq|Tq*hM3F6uYfP-(1?=z@raj(AF~Uo_I+42^;!mgNlzIvC zC7|S$r{uG-40?Tk-4@a;rv9XrYcc$5A^jN<>F3g7GfWjJh$+M(NDxOwhO=F;c07Ll z{(dx0??d&TXEOyUiyHko!0N{%f1NdO1vrdNIbVheKraof1|*7}Rx^JP6J2kj2&|;5IQ}q-ybr_CKlfKKYo36LysYMT=cH_Bs3b z;-y!I7v{r){*OVdeGBfY zyPiQTQ_ifZFLs4*GHlC1ZYd?u_Rz@5lG?#u{-BZVD`sB5v(lq#a@NQX(e~oY9=(0D zy?!EFyvnk`a{Eq8SGjHd!kTs(pY5}tX)cqtN*QJwLi-v)EqCR)8lCRE8A)gzr};0s zu0#)`M1NYchrWk8xa_g=DA=C5YaFSZ=btCE3|6`Jp)#o(Amh|m$v&IoaAeL=RZ05g zoY*{jyQ}-y^#Oyht>Q*RUrNsQ6{4Kt?Ti?$??LRzSUDon;3ci!xg$mKg;1|n6+EiK zY;o+Z2MY=vKCLxy^W=Wz9vgU!j>n-1{oz31sDJq#dK)O<`GNU4Pg)F=vRt9oM$|Tx znX#>#Uf;n;`{)Sw3Rsr+xPB=E+hh2q2mblnDx11*-c%i*0u5G-`h#Z!f;N z#qz=$62dq%0*mAQJ|Jw3&~M~HqhUj=2ViwWNh7!Rf4s#JcNxaL_P%ZBls#Ze-5$EA z8_+n3-hW*$A5oW^V+(6>9LiBOKa(YWkynohSkzFpD1X^I0!>`mN)5Ci!0WwJxz7mb;DmqA+e0QO?BCsrCKw}J|Ttf%X>lsJU% zfH&x_u_&UW^j)FKGn(5FYEEjbqgI|t&))GB9vPOfOyy>tn{lzV{YUw$nC8$>o3os& zu$LNR2DCVFnzbiTePl>m63ZvJQoq^a!@wMzW?w;-k#h=;qtF!=(Ne=tLmQAQnsLs% zlQ-&rfqupW07v6m(c6;;d#!B}|2NX2wgczUr*Rm&%c{Lf^SzVwHFOQS?R8=Aw!IJQ z8aaC98F6>p(hgT=rmy>*j$7EJ`Mh4h^aq9}zb`A=~uM_N_J+-seOFedFXvO&V=yc>q>!~%(I#Cm zyr8CrKAsTFQj%T$CYKsUxk)$E@G#qdxjY+lyuY*ES-{$5 zsTZ?GZiYzwJffOiUuP*+br4ML`fVY_-(}Ic^cq)dsxt-0VF}elHXOei&#{TJ$k{$* zil@TcX1{=t%L~{XF)QL`0^n=@lgZxMPY-tGUCga!k*vi*6)H*GnOhw~=FY2Qmyz!8 z*pyYL?BKHq^T2VFiY09~9NAlPW6xL*1JH>es>;m~kB%~Bh0M3%`YL^%m+U*LyO=gd zxpO0mAd9TnV0MXaso$OGK`2ERv@cj@xAI9%FZXJ{w)U_5F_<8dtq&O6svvX#;}+`K z_dqhp$}6nPg`Hk>K)MnxGOZN+2;e?*{c~IGz!jsq9C}nI?t_a{>k6g1E>-NAo1oN8 zNIEn_IpId+7Zh97Pg5?vq(TkjG7QDYQ8v0(pZo?VMVUfhr}A3eXvuZP&*dRCjJJ8G zTA0;ZUonZ&#l(2#FjJH@QTB;T(IVokLH1gA?xVnc%YCi6CsNy#H8vAT>IS1IN0Q&a z^9wWaHCfkqD2&zpyjg|;{efXbP^Btb%Lt_jvbMGjv*_d^wS=KaO_laj=taxR+ZMkn zk&U*-Gv<-XwXwusU-%j{W?B27=^1(^52CJ7(58}TUA!;=M+OhlKN3SufwqrVsR9d=f5AlS_m4&ZMnJP6GY7keFKY{3-mGq9VI(7f;gxy zA@d|n7G|hqvZLqiCN=EvKHH9ug?<&YN9WDWkdv3tJWLK6cf1oGeNkNK`{0(Y%DYfk zOb+EgR~e>;7A&sgsl5&`550fU&`-ycab7G(7@yYW8h(|$z=XnK+K18NPL!H`!ql}9 zaradd>!nnFsGbDtzAiQm444&vB^i^%Vqp(O76 zVfH+>Z*{Q9R+l8N?1`8g6erNx^iF{_%3wIdN3{AmuaL>{i<6s&7%&=%N0N1MU?ZMl z+6)Pm7NprToj03W*3l^(#csjsc^)(+3w=~_UbAhzzY^qz0V1Z8dcM+%mc~?H8*Q;( zCkou?rc^Nn;S*iw z$S@15CtVo17=#NvmZ9u*R)r*F?lg?y!x7K#{46W>v z%2a`L`wI~6oil&)wdL-IN$oTc{T z=v};dP5zG^J0{o^6;`lCLK$pC)Af~1^LFfLYIlkok9TMxf}CM=ng?x;u#dPi68o(} z3V(V5K=^{@1?QH`o4zW7ew>KtO1c?cT10yv6FwihTy|*mv~TgOd%2XLYz~8rv0vLO zJ(*-BymyY!)&Ia-Txk;2O6AL2v$pk76U^0tUi?UN>La4CO$&){#!PrE8K<=CXI4qY z=SdrMpHCIu4sQJ*Q(Ji`IH6p~QvwUc)?e+=C--7P8LJf*4?KU%Cy zHQ%@DL1JV&<&PxFOOd>g3#lM;TsbK2t8F4deIyVVf|%2C&2}Kstb$*x>eaok-&`OW zzShVn`VGTG_-0%&GyOvItDNE_RSS{K6Q1=`D@A=KQ}PV 
zrAUm9$}^axy7?6u7+idyo?(*@qRFJg8(KEo(-h#2YSpbSXRlc}I2N?=`gXBpTLaBO zaH*ASppc1}%S3GNe)jt{gwK%H*B2%rav31X4Us6mhanY_^&@kywQ4jEHMYyRaRGj- zkIuFCW$!J>t3~T5d@bhX%AjfTY_Iw-YcHQill~996+#ZM3AAN(2N)=CxH2TNdQwbI z>Gpp90{FLi{5zx4?;lsO@&DmYzzCJV5S&y;$-pD{Bh%;CtN#8|6tt#y8JqqU{{{UC z81mCrO&xCiDyIJX=NwWD!9s+j!?w7CCNJU}*@6l;A*U=cq{T}B+J2oufizla2KbvX zUrfYL1=vpr0-yP3AB7x{@H+rdwEdw&^c@E`(A-{y{ra2R!hQH>jQ)lU4PF9cedO&A z0bINrG78uY5(NxU0XsdU0>41Sj)oH(g9mjAUg?J3`%@JA3s2K7P*7r^9eNO7cIY7E zCizJqHCLFe%kV}MB*B3j4V#294j_2GFRIgR1|)527L$wej{CEYMV;3KPjO*^nDf z>4;~E22$CFN88g>8c#p}&9C9F_wgrJhFP|x*{$D454^R&OC@a9nhq@cQhHbVGn_SF zA@<~ewq35n=(&ruqK`FN}ac?BpPL`j* zC&h-5!hF5cl{a3`0!7;IqG=ADOBG*~^vrr1l{MP__t=m?`z8&uG*9F143#D(Y@WPk zOwbk6e~p&?LyzZ=x0`msX&@g{;q+i_snWBlVa9W#@RpE>kkfJkXv^MLFxX^VhBmm^ zw^G>4Eh()YXhxg;?+5rRz;3lr)uam-ehHD4EO7LsGNdrZC%z$ary%Kxh1iNyHnemZ z^mEDFo7H`nDOSwkQw2Ia$mmC8ULMt%R82+t z1>o_}#UU&kWPEYDQ5+}eXuMX)F$?y-d}r^(+%Fn7FULYI=;$>H9!Z7}Qkj?U6Jl$l z)nQNr*K+88Hu1l24|$OGigCY>N~NXN^O#`CC{UO`p$qOSGvfSg^)aAlUSNYVJ^yw@|w;xIft zPAH#7r0m&9f+|QF%~wiIz7#!f^}N|z3kTj8$j&>pu=^uc&4SfJ%5GyJ)Q#ZfgU9gW z!08*^M7xMzXv5#Ws3kVWC-*?aEk$&Kezc8Pljcw99C?u9mpLxmL!F!4d(}HV0zvVB zL-Rn0^qFC2JWCSjsda?lQ|UF~cU4JtBY|>7kb`;%S@Ca#}0U8!* zwnv05WZ87ryLFcJTu0#DnUmkJfV!RD5R-by@os1tVa5M4p)mN7;~C0~sX`?Ze?5_I zXw0h7pIjfH0wroelZRnp0v#&o#i>B=+^vl7_&HHqIx&;=vCKr7EgK*25Vk&N>q}A| zeL+7E*P9zMz{0jScJWgQ?VIF__e#$nSsrj{fc%ZIG;F!|n}&fGu~c z`?p?`y44Kh;+g^JUGK8pP8}PdM7ZX^r1%6PQjEIVqHeO?(JfE7lUm>oIzw?>CbAxv zCo9wI{{e>o-fRBjF@2!KKo7rJtxZJC0#|+lNl>qU;O8(;nEH3qhS6e!BoHwtoW|Sz zFp)hLcvB`bHb1R)iFaIRPaFRP`%!~v!u@PNd#}o2KEPVMKZa%WS~OY(e(o7+!d%>8q zeQhvsz|^Y(o;D2339RQ61(Il7uOv*x(O=%EUYIU-K08Xr1cqb(Jay0w>_xUm4V>Eb zcD6w}8285#_H)u_csm>yQcB3*NCjpnHKa1<(%1=0?(gTL3}b#Ez_+HoP5p|M#7dznKG!zep;AqQBGp{(3B7h-4#b zuLOSh7Xd1A$rS+aKIP!>f0T4+NLplrFfBFyznvKr>`L(N3n77Q|NhmW+JG{JF>*3z z`S-*9>oKTZTd4%7_5S^<`8R;napsaftpC31f8Jeo;N2_d=l<+7|8>xRTo{ZS;G+1u z+!Lhy_w(w2lizH8DzwjR z2*uA!u5;)s0%7R8N0R}iFd!#hcK^|KxU&nIZ$e5JAjuJHVcrRJEB-cap$w3p$MwhL zjyadKGRJPSuaPoazymNnD$qgl{3!UIsY`?lHizQJa{&3v*4sE1mAjMWcCUdgZ}(8H z+EU9KzXw;B7t`xOWgtzp{82!(#rVh_-H#gGiAH?Qwv13-j5wx;%s|f^^FfXf^nD8w zBdJEa=bN$25Ob16XSENLD$Gs~tiBiM44%2_iMZOt14$^-{io)L%VqsK+-I=^L?g+4 zbU6R%xa>{?Xvp=hRDA0qYT_nscy@)G`*XX+wIg*mrM_vQQDHYvE&E8aQp9$XR3IDQFvz-~xIgXaEJw5#N!V_O9 zT~L)$;2Gmhk>?P%x2>k z|KY&;;r$&^#b-k~cR@uP_C?z3c-l_(`m@QSUe3w$*{iRidzZNy`B@3q$K{{E|omG0bl7PIq8$0FM* zr=_?^Dt_{Q8kPR!*KC>{pzK=gm~a04yptHN@}{^BsccX7b+wtj3klydgKSi~Y$UR4 z&95a#z3WZ0@4_9em#3%wGn)CS6%bqs^{X-PoreW;K;@ z#lE_^ZM9<(1Ae)I@A#y9>eDKx`#3k)8JQj=sB^?MeXrcjH-|tt@*chO&Y2G)vs<+F zDto38c`NAb4tt;jug4yv7AZs#h=oSWwrv}A{W1^Fa|3##V zSZ5$;^0yv^be_k(0gx0 zBUT&`#w~)4g}{Q>*;d72>q(Ra>gn6qh70BwNf#?`{AQl373-r5<!G+4d9*c|(hC_?1kBuuyzihr#u`vWdf>J?fT~ksm}%M> zH(t06oDd2#ow_4O37Z-pwez^)YXf1dp6{7$G=+Gb?Uh|*aB0A!+xAjP)0jU-Ci!&C{%AAB&IGIZWZ#c_vXrn7+sXf*q)Qkx zunk-g%-4~sdE&Y?vBWCoy&AJQThV`LwR^hpy>2>#EWi8|-{?~>m3{Ill&(N2nNxCR z1i=>c^l7c0qsB8_gwGuvV#MK+CrpO0&wU0$BqTV~+l4}x*YdP*C+v8k;Xp&A%_EBzKViVpa8>N~s za_4UTjiZZ`6@z4##c_}QUVdghoHL|+rNXngc0|wfc-yRrk>lxUyG>xsSJO^o5R?+H zBPX>VPSQ?AgyCa6m4i+wojorIQ>4Rq)}L9sOfg)xA>z4H?xwk4bq>cVD*=Xm9!lm> zjuO;T^(^YEZQ7hsLeA!C7qAC`$QXl&6_~bUP{1k%*R_6mZ6b!SXs+(tk3T*twFxKc zLCSzLwFFrN_1!Q2>#AoVYb?PJpwS>;tr^yU(-J?ij=H~Q@g~K#K0hWfNw8*>{Tm24 z{ti2bgn09e_+DMORa>?sFs?g327Zxh{N@LiaN48R;{Mx1YORzt<^`V~CMjRbPwL?! 
z)$&c|?R2kxJ6nQBFP~ZCq12ZKF2RGRy_17rcds?eQF{=VZ-2(-lBMXTf7JABbd=xy zMk7IP2j;`PnfnOgCfsbbm`3e_Mz+blFhJ z#yXhsKHr^MMwh&%?+)B4`7>_X9;h*1lj-NsAtho@@oow$ zf-M~b*f#7=EPYfEhh$14Le^$3o~nUg)ybwiY!7^B&v?{)b+IGXnIRHw+MW0mCM^9i zSc|qmtGK_HJds@|4^5eXXB)&5CWDQjR5J`@FgJ~{X?V)Ej(VCqyPtU2Pyv2CUsb@s zF`)k1mTsJV42dezfy=fJ>YlN(0%pGey;2l)bk(}yJeW)<_#saN8_)x@jermKYaYO;=?1X~_2Xf-7K+NjVxW&!FVEw80Jd%xj>m1E}7oXSJ zRc7#vw4e?#)y$EX!jX9t8RIzv3N_q+@=;6`q}iX@7;;Tz%fzp zI7v#mREnf&UXi}w2#j8*nPUSn;2-*p6@NoY3h2y6<^`~)RM5_jcLaL9(kQ32>Tz#O z5y2cU0G86JE=|B%;0I>ly`B22XJhk}q|C?FO-T+{vO|>oHk*4t{aJcO4UV0y>CbS|lF4*YH+4RwJ906vL}AHk)Sc`cP?(A^+@x&yo)3E5CvF?U$R zxW)IDaJRiHo9VB;qG^s(C7;c9g5>Df@l4vXtp-J^Uefp+e2a>*c3<;{&4K+xC71L#A+b)$Gjewiwf zE)-AyAZC%w2)!dXFt;`~uq{n2f%Wwop*|a*^-#>-QkO^mGOI|ILZGUBOr3la9EJY6 zzANkj-4_%N)oIYTOR>o0IuVY6c@$pK6Ew4n2wsK&sXN$o-|OxPakzvr;?^(v71L=$ z2AdK#07d@!R&Y>9zWSUb`B)ecgQAf8fIKV@QlR6^qDVNelzVV*pEn5Mt{*RNV}sw1 zY7Nwms;X{Po}o2{dyj7lIt;icj%^WLT!N)?F;ZY6*{j}eI!SsflfLz)ir7YmG28jY zF%vwQ(+F)J-yeg<0s1;U#3F`o<@r3s>N9Pa?%7Z`6no%;D@+l9c)S{5V=!Ni8>G8f znYy}nI`|ZuWQNEQfd^7na#-4IgP%>_{U;U%V@BGvI^Mv_ zLwqYWkz3f^s|tA#IRa&2yd`^=e6iABX)vhHz^aaN5q@%COb$p|Y>0QHa!QuH*BWuJ z--F5{=mVz+=T0Bi)xQ#HzZ_O(fG9S1E zUI;0U<1xjbSuYLoAd{zS?Xh%3aow#6GD>x1 zd?>4YvxeHQ*r#c3TwPAO;t?TQUuCJuQj90J(ohK}CAJJHY8VH#PRAP){K2S&6k}VE9csQilslN}P#Nd>B|%^|U+d~U z&nlnO(t7D(^Qo-%b*XMq{3)xW+O zgg!RIf2WedyT)lA&{W*jm&CJ}=SUfMPVdN-H5$?|zu64Gd(>P!++h`HFjY$EE97?h z1b?2nzAr3z{4JDu-a}8mWJ&{*Ht3=DKD#qb8V@$tx=01x&b2033=f1~(?jZ@kzJEB z$mWxB#zc*tu^umR*fX@=)Y>exzGCy`?29u$fn#!W>E>E&Y5lCFZY}cY)RH#y9iyIG znC~!XTrw5>0YYZ6_eDaxmHk1XZiQpoe3$N-G9g9(R=oPA%}0Hwc$O%7f7Xr7}A%!UUwJff*3-YOfB#h9YlrCY&sJGziWJdKfUKN==PE|bJC~qYPX%%t|_wE zQxg#;PwPsX?XX}&tB+C_g?lzO{&e%+OxGsA!EEyQdNZMTLap$Y&^^fQ#xL{Afulsh zR4hby<(Naj}t@{#tfo-Vu43Q@;qpm@GPFyL+PhXI9*^o`i?*d z$o1Y5oWbW$5baV-j3JerF6+Dtjsb{wiYb>5?d&LVPSB%y{R*r{-FnrR$=RB!Ih(Ib zk;dALh2ZxlbEyHpjCc11%YnB96(P)he5zE$=QW3cszu6#NsGEWiBx5yXW%t_B|ed? zG=VD--xcyL*90Ji@9O!!G!>;&&3Wx0ny~}%*eOKbV_{=zc9NS|ih+nXlNOQC#Jglk zmML@Hu7=2j;%|$nLUN3bEAejSLrA`9i5jL!8iasn&S;tJ})C03b-9h{OYlboycy-1}N^V}@@8p5Izf=6L`yghA4W1ksm^uXExuH-RtXmL}> z4G@AY&CB*RXnE!vc?f^J!ls&x^T$RJK$gH{`?dH>5Diwm$9$Rkf2 zcR$oo?XAk_{wLOP3uDiU5zpr0YZ(y@acdfcVSxa=AW=e**nX4if2Bd&SpHr__T>C7 z;(d@wAj0{pmG%3vzdRBai<9_F@o&!p-3~2)#ij`JY~ug1?)`?j=#fT^@@SprzklH$`KZ_TYsK!&?*IxEuCHg(qw)RqF3#!0cLWdL4sH9(rA)ODD>-*R6nO; z0Jiq92_rn<)Lqd1&p!oMUkmMAXhR>{32+5~XxZU#qW-#Fz|p4TTxl>02K7|_-y~Om zC|X;V;r%8o{@RKtGyHSyX^R;Fx|0^w6D~I&f+xdJ)&n7$Bpretk!;@{wkd_PpkD ze;2~2dGS@h_%ZmD;kC={YNw+f3Y|KelzHc0@L|Hk77Ac}QKMI37u22(+zo+tN;>Em z3_XY9za*~iJJ)7~7Q%ktr}Uu!jmByRHdw$H@W-I-5Eik@!d zk;RkMoh(y;3ElUPqLY|1v>QH+$Vf`KW`F#S0 zd0*EB)HPREfTusV@06gH(8N@oZQ2_8q?gZb@OX$x&+eE>+})?b@L9F{FCk6eMskly zqDU6jztvXf-@F?mVM;vrd@fcY-ZoZZh-j{s%B5hOxYUxRIJmo}*)vvMjUX!XVSi=Xv;sggJY-=~0Gbte%BY30icbj;o* zHo}}T|DT%5xXHpkVFsPrpd>YkMTtb*kVe>(QA!Md|;k?|T2cs{vQ4;axv-IFYK zJmri;?BKQ&L%9KnoO?c}9j^G2)^wM$?c#Peg!^ycKQ_+<=mY-@lB?`(`| z7VF=!&zHd1VfjVh{N0}V<3Wo#+OEldqZf@xjm_mFqIcdWdmwy?SOM4{dJopugX&1? 
[... base85 binary patch data omitted ...]

diff --git a/docs/source/assets/deployment/anything-llm-upload-doc.png b/docs/source/assets/deployment/anything-llm-upload-doc.png
new file mode 100644
index 0000000000000000000000000000000000000000..00c70e9c01f672cf4bc83fc4277b7f0c60ff3e55
GIT binary patch
literal 114117
[... base85 binary patch data omitted ...]

diff --git a/docs/source/autodoc2_docstring_parser.py b/docs/source/autodoc2_docstring_parser.py
new file mode 100644
index 00000000000..41c49ed1c54
--- /dev/null
+++ b/docs/source/autodoc2_docstring_parser.py
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: Apache-2.0
+from docutils import nodes
+from myst_parser.parsers.sphinx_ import MystParser
+from sphinx.ext.napoleon import docstring
+
+
+class NapoleonParser(MystParser):
+
+    def parse(self, input_string: str, document: nodes.document) -> None:
+        # Get the Sphinx configuration
+        config = document.settings.env.config
+
+        parsed_content = str(
+            docstring.GoogleDocstring(
+                str(docstring.NumpyDocstring(input_string, config)),
+                config,
+            ))
+        return super().parse(parsed_content, document)
+
+
+Parser = NapoleonParser
diff --git a/docs/source/conf.py b/docs/source/conf.py
index a83ad764125..060649e43b9 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -13,16 +13,17 @@
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 
 import datetime
-import inspect
 import logging
 import os
+import re
 import sys
+from pathlib import Path
 
 import requests
-from sphinx.ext import autodoc
 
 logger = logging.getLogger(__name__)
 
-sys.path.append(os.path.abspath("../.."))
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+sys.path.append(os.path.abspath(REPO_ROOT))
 
 # -- Project information -----------------------------------------------------
 
@@ -40,8 +41,7 @@
     "sphinx.ext.linkcode",
     "sphinx.ext.intersphinx",
     "sphinx_copybutton",
-    "sphinx.ext.autodoc",
-    "sphinx.ext.autosummary",
+    "autodoc2",
     "myst_parser",
     "sphinxarg.ext",
     "sphinx_design",
@@ -49,7 +49,22 @@
 ]
 myst_enable_extensions = [
     "colon_fence",
+    "fieldlist",
 ]
+autodoc2_packages = [
+    {
+        "path": "../../vllm",
+        "exclude_dirs": ["__pycache__", "third_party"],
+    },
+]
+autodoc2_output_dir = "api"
+autodoc2_render_plugin = "myst"
+autodoc2_hidden_objects = ["dunder", "private", "inherited"]
+autodoc2_docstring_parser_regexes = [
+    (".*", "docs.source.autodoc2_docstring_parser"),
+]
+autodoc2_sort_names = True
+autodoc2_index_template = None
 
 # Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates'] @@ -77,6 +92,11 @@ 'repository_url': 'https://github.com/vllm-project/vllm', 'use_repository_button': True, 'use_edit_page_button': True, + # Prevents the full API being added to the left sidebar of every page. + # Reduces build time by 2.5x and reduces build size from ~225MB to ~95MB. + 'collapse_navbar': True, + # Makes API visible in the right sidebar on API reference pages. + 'show_toc_level': 3, } # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, @@ -164,68 +184,64 @@ def linkcode_resolve(domain, info): return None if not info['module']: return None - filename = info['module'].replace('.', '/') - module = info['module'] - - # try to determine the correct file and line number to link to - obj = sys.modules[module] - - # get as specific as we can - lineno: int = 0 - filename: str = "" - try: - for part in info['fullname'].split('.'): - obj = getattr(obj, part) - - if not (inspect.isclass(obj) or inspect.isfunction(obj) - or inspect.ismethod(obj)): - obj = obj.__class__ # Get the class of the instance - - lineno = inspect.getsourcelines(obj)[1] - filename = (inspect.getsourcefile(obj) - or f"{filename}.py").split("vllm/", 1)[1] - except Exception: - # For some things, like a class member, won't work, so - # we'll use the line number of the parent (the class) - pass - - if filename.startswith("checkouts/"): + + # Get path from module name + file = Path(f"{info['module'].replace('.', '/')}.py") + path = REPO_ROOT / file + if not path.exists(): + path = REPO_ROOT / file.with_suffix("") / "__init__.py" + if not path.exists(): + return None + + # Get the line number of the object + with open(path) as f: + lines = f.readlines() + name = info['fullname'].split(".")[-1] + pattern = fr"^( {{4}})*((def|class) )?{name}\b.*" + for lineno, line in enumerate(lines, 1): + if not line or line.startswith("#"): + continue + if re.match(pattern, line): + break + + # If the line number is not found, return None + if lineno == len(lines): + return None + + # If the line number is found, create the URL + filename = path.relative_to(REPO_ROOT) + if "checkouts" in path.parts: # a PR build on readthedocs - pr_number = filename.split("/")[1] - filename = filename.split("/", 2)[2] + pr_number = REPO_ROOT.name base, branch = get_repo_base_and_branch(pr_number) if base and branch: return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}" - # Otherwise, link to the source file on the main branch return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}" -# Mock out external dependencies here, otherwise the autodoc pages may be blank. +# Mock out external dependencies here, otherwise sphinx-argparse won't work. 
autodoc_mock_imports = [ + "huggingface_hub", + "pydantic", + "zmq", + "cloudpickle", + "aiohttp", + "starlette", "blake3", - "compressed_tensors", "cpuinfo", - "cv2", - "torch", "transformers", "psutil", - "prometheus_client", - "sentencepiece", "vllm._C", "PIL", "numpy", - 'triton', "tqdm", - "tensorizer", - "pynvml", - "outlines", - "xgrammar", - "librosa", - "soundfile", - "gguf", - "lark", - "decord", + # The mocks below are required by + # docs/source/serving/openai_compatible_server.md's + # vllm.entrypoints.openai.cli_args + "openai", + "fastapi", + "partial_json_parser", ] for mock_target in autodoc_mock_imports: @@ -236,18 +252,6 @@ def linkcode_resolve(domain, info): "been loaded into sys.modules when the sphinx build starts.", mock_target) - -class MockedClassDocumenter(autodoc.ClassDocumenter): - """Remove note about base class when a class is derived from object.""" - - def add_line(self, line: str, source: str, *lineno: int) -> None: - if line == " Bases: :py:class:`object`": - return - super().add_line(line, source, *lineno) - - -autodoc.ClassDocumenter = MockedClassDocumenter - intersphinx_mapping = { "python": ("https://docs.python.org/3", None), "typing_extensions": @@ -259,7 +263,4 @@ def add_line(self, line: str, source: str, *lineno: int) -> None: "psutil": ("https://psutil.readthedocs.io/en/stable", None), } -autodoc_preserve_defaults = True -autodoc_warningiserror = True - navigation_with_keys = False diff --git a/docs/source/contributing/deprecation_policy.md b/docs/source/contributing/deprecation_policy.md new file mode 100644 index 00000000000..598f1612d3a --- /dev/null +++ b/docs/source/contributing/deprecation_policy.md @@ -0,0 +1,87 @@ +# Deprecation Policy + +This document outlines the official policy and process for deprecating features +in the vLLM project. + +## Overview + +vLLM uses a structured "deprecation pipeline" to guide the lifecycle of +deprecated features. This policy ensures that users are given clear and +sufficient notice when a feature is deprecated and that deprecations proceed in +a consistent and predictable manner. + +We aim to strike a balance between continued innovation and respecting users’ +reliance on existing functionality. Deprecations are tied to our **minor (Y) +releases** following semantic versioning (X.Y.Z), where: + +- **X** is a major version (rare) +- **Y** is a minor version (used for significant changes, including deprecations/removals) +- **Z** is a patch version (used for fixes and safer enhancements) + +Features that fall under this policy include (at a minimum) the following: + +- CLI flags +- Environment variables +- Configuration files +- APIs in the OpenAI-compatible API server +- Public Python APIs for the `vllm` library + +## Deprecation Pipeline + +The deprecation process consists of several clearly defined stages that span +multiple Y releases: + +**1. Deprecated (Still On By Default)** + +- **Action**: Feature is marked as deprecated. +- **Timeline**: A removal version is explicitly stated in the deprecation +warning (e.g., "This will be removed in v0.10.0"). 
+- **Communication**: Deprecation is noted in the following, as applicable: + - Help strings + - Log output + - API responses + - `/metrics` output (for metrics features) + - User-facing documentation + - Release notes + - GitHub Issue (RFC) for feedback + - Documentation and use of the `@typing_extensions.deprecated` decorator for Python APIs + +**2.Deprecated (Off By Default)** + +- **Action**: Feature is disabled by default, but can still be re-enabled via a +CLI flag or environment variable. Feature throws an error when used without +re-enabling. +- **Purpose**: Allows users who missed earlier warnings a temporary escape hatch +while signaling imminent removal. Ensures any remaining usage is clearly +surfaced and blocks silent breakage before full removal. + +**3. Removed** + +- **Action**: Feature is completely removed from the codebase. +- **Note**: Only features that have passed through the previous deprecation +stages will be removed. + +## Example Timeline + +Assume a feature is deprecated in `v0.9.0`. + +| Release | Status | +|---------------|-------------------------------------------------------------------------------------------------| +| `v0.9.0` | Feature is deprecated with clear removal version listed. | +| `v0.10.0` | Feature is now off by default, throws an error when used, and can be re-enabled for legacy use. | +| `v0.11.0` | Feature is removed. | + +## Important Guidelines + +- **No Removals in Patch Releases**: Removing deprecated features in patch +(`.Z`) releases is disallowed to avoid surprising users. +- **Grace Period for Existing Deprecations**: Any feature deprecated **before +this policy** will have its grace period start **now**, not retroactively. +- **Documentation is Critical**: Ensure every stage of the pipeline is +documented clearly for users. + +## Final Notes + +This policy is a living document and may evolve as the needs of the project and +its users change. Community feedback is welcome and encouraged as we refine the +process. diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md index 03d830fe90f..b42536f054d 100644 --- a/docs/source/contributing/model/multimodal.md +++ b/docs/source/contributing/model/multimodal.md @@ -128,11 +128,9 @@ HF processing as well as memory profiling. ### For memory profiling -Override the abstract method {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs` -to construct dummy inputs for memory profiling. This dummy input should result in the worst-case memory usage of -the model so that vLLM can reserve the correct amount of memory for it. +Override the abstract methods {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text` and {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_mm_data` to construct dummy inputs for memory profiling. These dummy inputs should result in the worst-case memory usage of the model so that vLLM can reserve the correct amount of memory for it. -Assuming that the memory usage increases with the number of tokens, the dummy input can be constructed to maximize the number of output embeddings, which is the same number as placeholder feature tokens. +Assuming that the memory usage increases with the number of tokens, the dummy inputs can be constructed to maximize the number of output embeddings, which is the same number as placeholder feature tokens. 
::::{tab-set} :::{tab-item} Basic example: LLaVA @@ -244,38 +242,45 @@ def get_num_image_tokens( ``` Notice that the number of image tokens doesn't depend on the image width and height. -We can simply use a dummy `image_size`: +We can simply use a dummy `image_size` to calculate the multimodal profiling data: ```python +# NOTE: In actuality, this is usually implemented as part of the +# model's subclass of `BaseProcessingInfo`, but we show it as is +# here for simplicity. def get_image_size_with_most_features(self) -> ImageSize: hf_config = self.get_hf_config() width = height = hf_config.image_size return ImageSize(width=width, height=height) -def get_dummy_processor_inputs( +def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], -) -> ProcessorInputs: +) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) - processor = self.info.get_hf_processor() - image_token = processor.image_token - - hf_config = self.get_hf_config() - target_width, target_height = self.info.get_image_size_with_most_features() + target_width, target_height = \ + self.info.get_image_size_with_most_features() - mm_data = { + return { "image": self._get_dummy_images(width=target_width, height=target_height, num_images=num_images) } +``` - return ProcessorInputs( - prompt_text=image_token * num_images, - mm_data=mm_data, - ) +For the text, we simply expand the multimodal image token from the model config to match the desired number of images. + +```python +def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + + processor = self.info.get_hf_processor() + image_token = processor.image_token + + return image_token * num_images ``` ::: @@ -412,29 +417,30 @@ def get_image_size_with_most_features(self) -> ImageSize: Fuyu does not expect image placeholders in the inputs to HF processor, so the dummy prompt text is empty regardless of the number of images. -Otherwise, the logic of this method is very similar to LLaVA: ```python -def get_dummy_processor_inputs( +def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + return "" +``` + +For the multimodal image profiling data, the logic is very similar to LLaVA: + +```python +def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], -) -> ProcessorInputs: +) -> MultiModalDataDict: target_width, target_height = \ self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) - mm_data = { + return { "image": self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) + height=target_height, + num_images=num_images) } - - return ProcessorInputs( - prompt_text="", - mm_data=mm_data, - ) ``` ::: diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index 31c7059fda3..89b31f0311e 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -17,7 +17,7 @@ Unsure on where to start? 
Check out the following links for tasks to work on: - [Good first issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22) - [Selected onboarding tasks](gh-project:6) -- [New model requests](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22new%20model%22) +- [New model requests](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22new-model%22) - [Models with multi-modal capabilities](gh-project:10) ## License @@ -40,6 +40,10 @@ pre-commit install --hook-type pre-commit --hook-type commit-msg # You can manually run pre-commit with pre-commit run --all-files +# To manually run something from CI that does not run +# locally by default, you can run: +pre-commit run mypy-3.9 --hook-stage manual --all-files + # Unit tests pytest tests/ ``` @@ -54,6 +58,12 @@ Therefore, we recommend developing with Python 3.12 to minimise the chance of yo Currently, the repository is not fully checked by `mypy`. ::: +:::{note} +Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU +platform to run unit tests locally, rely on the continuous integration system to run the tests for +now. +::: + ## Issues If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md index 6b794db656c..ca56710bc2e 100644 --- a/docs/source/deployment/docker.md +++ b/docs/source/deployment/docker.md @@ -19,6 +19,18 @@ $ docker run --runtime nvidia --gpus all \ --model mistralai/Mistral-7B-v0.1 ``` +This image can also be used with other container engines such as [Podman](https://podman.io/). + +```console +$ podman run --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + -p 8000:8000 \ + --ipc=host \ + vllm/vllm-openai:latest \ + --model mistralai/Mistral-7B-v0.1 +``` + You can add any other you need after the image tag (`vllm/vllm-openai:latest`). :::{note} diff --git a/docs/source/deployment/frameworks/anything-llm.md b/docs/source/deployment/frameworks/anything-llm.md new file mode 100644 index 00000000000..d430c170ef5 --- /dev/null +++ b/docs/source/deployment/frameworks/anything-llm.md @@ -0,0 +1,47 @@ +(deployment-anything-llm)= + +# Anything LLM + +[Anything LLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting. + +It allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. + +## Prerequisites + +- Setup vLLM environment + +## Deploy + +- Start the vLLM server with the supported chat completion model, e.g. + +```console +vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096 +``` + +- Download and install [Anything LLM desktop](https://anythingllm.com/desktop). 
+ +- On the bottom left of open settings, AI Prooviders --> LLM: + - LLM Provider: Generic OpenAI + - Base URL: http://{vllm server host}:{vllm server port}/v1 + - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ` + +:::{image} /assets/deployment/anything-llm-provider.png +::: + +- Back to home page, New Workspace --> create `vllm` workspace, and start to chat: + +:::{image} /assets/deployment/anything-llm-chat-without-doc.png +::: + +- Click the upload button: + - upload the doc + - select the doc and move to the workspace + - save and embed + +:::{image} /assets/deployment/anything-llm-upload-doc.png +::: + +- Chat again: + +:::{image} /assets/deployment/anything-llm-chat-with-doc.png +::: diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md index cb758d3e6d2..683fa8217a8 100644 --- a/docs/source/deployment/frameworks/index.md +++ b/docs/source/deployment/frameworks/index.md @@ -3,12 +3,15 @@ :::{toctree} :maxdepth: 1 +anything-llm bentoml cerebrium dstack helm lws modal +open-webui skypilot +streamlit triton ::: diff --git a/docs/source/deployment/frameworks/open-webui.md b/docs/source/deployment/frameworks/open-webui.md new file mode 100644 index 00000000000..83e5303a00e --- /dev/null +++ b/docs/source/deployment/frameworks/open-webui.md @@ -0,0 +1,29 @@ +(deployment-open-webui)= + +# Open WebUI + +1. Install the [Docker](https://docs.docker.com/engine/install/) + +2. Start the vLLM server with the supported chat completion model, e.g. + +```console +vllm serve qwen/Qwen1.5-0.5B-Chat +``` + +1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port): + +```console +docker run -d -p 3000:8080 \ +--name open-webui \ +-v open-webui:/app/backend/data \ +-e OPENAI_API_BASE_URL=http://:/v1 \ +--restart always \ +ghcr.io/open-webui/open-webui:main +``` + +1. Open it in the browser: + +On the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`. + +:::{image} /assets/deployment/open_webui.png +::: diff --git a/docs/source/deployment/frameworks/streamlit.md b/docs/source/deployment/frameworks/streamlit.md new file mode 100644 index 00000000000..084550ec991 --- /dev/null +++ b/docs/source/deployment/frameworks/streamlit.md @@ -0,0 +1,42 @@ +(deployment-streamlit)= + +# Streamlit + +[Streamlit](https://github.com/streamlit/streamlit) lets you transform Python scripts into interactive web apps in minutes, instead of weeks. Build dashboards, generate reports, or create chat apps. + +It can be quickly integrated with vLLM as a backend API server, enabling powerful LLM inference via API calls. + +## Prerequisites + +- Setup vLLM environment + +## Deploy + +- Start the vLLM server with the supported chat completion model, e.g. 
+ +```console +vllm serve qwen/Qwen1.5-0.5B-Chat +``` + +- Install streamlit and openai: + +```console +pip install streamlit openai +``` + +- Use the script: + +- Start the streamlit web UI and start to chat: + +```console +streamlit run streamlit_openai_chatbot_webserver.py + +# or specify the VLLM_API_BASE or VLLM_API_KEY +VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" streamlit run streamlit_openai_chatbot_webserver.py + +# start with debug mode to view more details +streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug +``` + +:::{image} /assets/deployment/streamlit-chat.png +::: diff --git a/docs/source/deployment/integrations/production-stack.md b/docs/source/deployment/integrations/production-stack.md index e66e8e6a16b..05f1568306c 100644 --- a/docs/source/deployment/integrations/production-stack.md +++ b/docs/source/deployment/integrations/production-stack.md @@ -16,7 +16,7 @@ Ensure that you have a running Kubernetes environment with GPU (you can follow [ ## Deployment using vLLM production stack -The standard vLLM production stack install uses a Helm chart. You can run this [bash script](https://github.com/vllm-project/production-stack/blob/main/tutorials/install-helm.sh) to install Helm on your GPU server. +The standard vLLM production stack is installed using a Helm chart. You can run this [bash script](https://github.com/vllm-project/production-stack/blob/main/utils/install-helm.sh) to install Helm on your GPU server. To install the vLLM production stack, run the following commands on your desktop: diff --git a/docs/source/deployment/security.md b/docs/source/deployment/security.md new file mode 100644 index 00000000000..e2ef8196c16 --- /dev/null +++ b/docs/source/deployment/security.md @@ -0,0 +1,58 @@ +# Security Guide + +## Inter-Node Communication + +All communications between nodes in a multi-node vLLM deployment are **insecure by default** and must be protected by placing the nodes on an isolated network. This includes: + +1. PyTorch Distributed communications +2. KV cache transfer communications +3. Tensor, Pipeline, and Data parallel communications + +### Configuration Options for Inter-Node Communications + +The following options control inter-node communications in vLLM: + +1. **Environment Variables:** + - `VLLM_HOST_IP`: Sets the IP address for vLLM processes to communicate on + +2. **KV Cache Transfer Configuration:** + - `--kv-ip`: The IP address for KV cache transfer communications (default: 127.0.0.1) + - `--kv-port`: The port for KV cache transfer communications (default: 14579) + +3. **Data Parallel Configuration:** + - `data_parallel_master_ip`: IP of the data parallel master (default: 127.0.0.1) + - `data_parallel_master_port`: Port of the data parallel master (default: 29500) + +### Notes on PyTorch Distributed + +vLLM uses PyTorch's distributed features for some inter-node communication. For +detailed information about PyTorch Distributed security considerations, please +refer to the [PyTorch Security +Guide](https://github.com/pytorch/pytorch/security/policy#using-distributed-features). + +Key points from the PyTorch security guide: +- PyTorch Distributed features are intended for internal communication only +- They are not built for use in untrusted environments or networks +- No authorization protocol is included for performance reasons +- Messages are sent unencrypted +- Connections are accepted from anywhere without checks + +### Security Recommendations + +1. 
**Network Isolation:** + - Deploy vLLM nodes on a dedicated, isolated network + - Use network segmentation to prevent unauthorized access + - Implement appropriate firewall rules + +2. **Configuration Best Practices:** + - Always set `VLLM_HOST_IP` to a specific IP address rather than using defaults + - Configure firewalls to only allow necessary ports between nodes + +3. **Access Control:** + - Restrict physical and network access to the deployment environment + - Implement proper authentication and authorization for management interfaces + - Follow the principle of least privilege for all system components + +## Reporting Security Vulnerabilities + +If you believe you have found a security vulnerability in vLLM, please report it following the project's security policy. For more information on how to report security issues and the project's security policy, please see the [vLLM Security Policy](https://github.com/vllm-project/vllm/blob/main/SECURITY.md). diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md index 7bed0a001d6..94bda8b5c58 100644 --- a/docs/source/design/arch_overview.md +++ b/docs/source/design/arch_overview.md @@ -52,8 +52,8 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -More API details can be found in the {doc}`Offline Inference -` section of the API docs. +More API details can be found in the [Offline Inference] +(#offline-inference-api) section of the API docs. The code for the `LLM` class can be found in . diff --git a/docs/source/design/mm_processing.md b/docs/source/design/mm_processing.md index 0947c1da1e5..dc92a3c2c51 100644 --- a/docs/source/design/mm_processing.md +++ b/docs/source/design/mm_processing.md @@ -47,7 +47,7 @@ Moreover, since the tokenized text has not passed through the HF processor, we h ### Dummy text -We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data. +We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data. (mm-automatic-prompt-updating)= diff --git a/docs/source/design/v1/metrics.md b/docs/source/design/v1/metrics.md index b3981b2dc24..7e7c8b925e2 100644 --- a/docs/source/design/v1/metrics.md +++ b/docs/source/design/v1/metrics.md @@ -66,8 +66,8 @@ vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/getting_ The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important: - `vllm:e2e_request_latency_seconds_bucket` - End to end request latency measured in seconds -- `vllm:prompt_tokens_total` - Prompt Tokens/Sec -- `vllm:generation_tokens_total` - Generation Tokens/Sec +- `vllm:prompt_tokens_total` - Prompt Tokens +- `vllm:generation_tokens_total` - Generation Tokens - `vllm:time_per_output_token_seconds` - Inter token latency (Time Per Output Token, TPOT) in second. - `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds. 
- `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in RUNNING, WAITING, and SWAPPED state @@ -86,6 +86,17 @@ See [the PR which added this Dashboard](gh-pr:2316) for interesting and useful b Prometheus support was initially added [using the aioprometheus library](gh-pr:1890), but a switch was made quickly to [prometheus_client](gh-pr:2730). The rationale is discussed in both linked PRs. +With the switch to `aioprometheus`, we lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](gh-pr:15657): + +```bash +$ curl http://0.0.0.0:8000/metrics 2>/dev/null | grep -P '^http_(?!.*(_bucket|_created|_sum)).*' +http_requests_total{handler="/v1/completions",method="POST",status="2xx"} 201.0 +http_request_size_bytes_count{handler="/v1/completions"} 201.0 +http_response_size_bytes_count{handler="/v1/completions"} 201.0 +http_request_duration_highr_seconds_count 201.0 +http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201.0 +``` + ### Multi-process Mode In v0, metrics are collected in the engine core process and we use multi-process mode to make them available in the API server process. See . @@ -456,6 +467,9 @@ In general: hatch](https://kubernetes.io/docs/concepts/cluster-administration/system-metrics/#show-hidden-metrics) for some time before deleting them. +See the [deprecation policy](project:../../contributing/deprecation_policy.md) for +the project-wide deprecation policy. + ### Unimplemented - `vllm:tokens_total` Added by , but apparently never implemented. This can just be diff --git a/docs/source/design/v1/prefix_caching.md b/docs/source/design/v1/prefix_caching.md index ec1f3cb8d64..ec661d8ec64 100644 --- a/docs/source/design/v1/prefix_caching.md +++ b/docs/source/design/v1/prefix_caching.md @@ -16,7 +16,7 @@ In the example above, the KV cache in the first block can be uniquely identified * Parent hash value: The hash value of the parent hash block. * Block tokens: A tuple of tokens in this block. The reason to include the exact tokens is to reduce potential hash value collision. -* Extra hashes: Other values required to make this block unique, such as LoRA IDs and multi-modality input hashes (see the example below). +* Extra hashes: Other values required to make this block unique, such as LoRA IDs, multi-modality input hashes (see the example below), and cache salts to isolate caches in multi-tenant environments. > **Note 1:** We only cache full blocks. @@ -76,6 +76,24 @@ Block 3 In the rest of this document, we first introduce the data structure used for prefix caching in vLLM v1, followed by the prefix caching workflow of major KV cache operators (e.g., allocate, append, free, eviction). Finally, we use an example to illustrate the end to end prefix caching workflow. +**Cache Isolation for Security** +To improve privacy in shared environments, vLLM supports isolating prefix cache reuse through optional per-request salting. By including a `cache_salt` in the request, this value is injected into the hash of the first block, ensuring that only requests with the same salt can reuse cached KV blocks. This prevents timing-based attacks where an adversary could infer cached content by observing latency differences. This offers protection without compromising performance. 
+ +```json +{ + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Here is a document with details about the world series: ..."}, + {"role": "user", "content": "Who won the world series in 2020?"} + ], + "cache_salt": "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ==" +} +``` + +With this setup, cache sharing is limited to users or requests that explicitly agree on a common salt, enabling cache reuse within a trust group while isolating others. + +> **Note:** Cache isolation is not supported in engine V0. + ## Data Structure The prefix caching in vLLM v1 is implemented in the KV cache manager. The basic building block is the “Block” data class (simplified): diff --git a/docs/source/design/v1/torch_compile.md b/docs/source/design/v1/torch_compile.md index 57dba680b97..7920131643c 100644 --- a/docs/source/design/v1/torch_compile.md +++ b/docs/source/design/v1/torch_compile.md @@ -99,7 +99,7 @@ This time, Inductor compilation is completely bypassed, and we will load from di The above example just uses Inductor to compile for a general shape (i.e. symbolic shape). We can also use Inductor to compile for some of the specific shapes, for example: -`VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.2-1B --compilation_config "{'compile_sizes': [1, 2, 4, 8]}"` +`vllm serve meta-llama/Llama-3.2-1B --compilation_config "{'compile_sizes': [1, 2, 4, 8]}"` Then it will also compile a specific kernel just for batch size `1, 2, 4, 8`. At this time, all of the shapes in the computation graph are static and known, and we will turn on auto-tuning to tune for max performance. This can be slow when you run it for the first time, but the next time you run it, we can directly bypass the tuning and run the tuned kernel. @@ -134,6 +134,6 @@ The cudagraphs are captured and managed by the compiler backend, and replayed wh By default, vLLM will try to determine a set of sizes to capture cudagraph. You can also override it using the config `cudagraph_capture_sizes`: -`VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.2-1B --compilation_config "{'cudagraph_capture_sizes': [1, 2, 4, 8]}"` +`vllm serve meta-llama/Llama-3.2-1B --compilation-config "{'cudagraph_capture_sizes': [1, 2, 4, 8]}"` Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture. diff --git a/docs/source/features/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md index 6056ca0d366..8865d26deae 100644 --- a/docs/source/features/compatibility_matrix.md +++ b/docs/source/features/compatibility_matrix.md @@ -42,7 +42,7 @@ Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/h * [APC](#automatic-prefix-caching) * [LoRA](#lora-adapter) * prmpt adptr - * [SD](#spec_decode) + * [SD](#spec-decode) * CUDA graph * pooling * enc-dec @@ -122,7 +122,7 @@ Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/h * * * -- * [SD](#spec_decode) +- * [SD](#spec-decode) * ✅ * ✅ * ❌ @@ -377,7 +377,7 @@ Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/h * ✅ * [❌](gh-issue:8475) * ✅ -- * [SD](#spec_decode) +- * [SD](#spec-decode) * ✅ * ✅ * ✅ diff --git a/docs/source/features/disagg_prefill.md b/docs/source/features/disagg_prefill.md index 52d253b9c2b..2fa20140c08 100644 --- a/docs/source/features/disagg_prefill.md +++ b/docs/source/features/disagg_prefill.md @@ -21,11 +21,11 @@ Disaggregated prefill DOES NOT improve throughput. 
## Usage example -Please refer to `examples/online_serving/disaggregated_prefill.sh` for the example usage of disaggregated prefilling. +Please refer to for the example usage of disaggregated prefilling. ## Benchmarks -Please refer to `benchmarks/disagg_benchmarks/` for disaggregated prefilling benchmarks. +Please refer to for disaggregated prefilling benchmarks. ## Development diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md index a71da72e436..b5b51095b3a 100644 --- a/docs/source/features/lora.md +++ b/docs/source/features/lora.md @@ -106,19 +106,18 @@ curl http://localhost:8000/v1/completions \ ## Dynamically serving LoRA Adapters -In addition to serving LoRA adapters at server startup, the vLLM server now supports dynamically loading and unloading -LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility -to change models on-the-fly is needed. +In addition to serving LoRA adapters at server startup, the vLLM server supports dynamically configuring LoRA adapters at runtime through dedicated API endpoints and plugins. This feature can be particularly useful when the flexibility to change models on-the-fly is needed. Note: Enabling this feature in production environments is risky as users may participate in model adapter management. -To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING` -is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active. +To enable dynamic LoRA configuration, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING` +is set to `True`. ```bash export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True ``` +### Using API Endpoints Loading a LoRA Adapter: To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary @@ -153,6 +152,58 @@ curl -X POST http://localhost:8000/v1/unload_lora_adapter \ }' ``` +### Using Plugins +Alternatively, you can use the LoRAResolver plugin to dynamically load LoRA adapters. LoRAResolver plugins enable you to load LoRA adapters from both local and remote sources such as local file system and S3. On every request, when there's a new model name that hasn't been loaded yet, the LoRAResolver will try to resolve and load the corresponding LoRA adapter. + +You can set up multiple LoRAResolver plugins if you want to load LoRA adapters from different sources. For example, you might have one resolver for local files and another for S3 storage. vLLM will load the first LoRA adapter that it finds. + +You can either install existing plugins or implement your own. + +Steps to implement your own LoRAResolver plugin: +1. Implement the LoRAResolver interface. 
+ + Example of a simple S3 LoRAResolver implementation: + + ```python + import os + import s3fs + from vllm.lora.request import LoRARequest + from vllm.lora.resolver import LoRAResolver + + class S3LoRAResolver(LoRAResolver): + def __init__(self): + self.s3 = s3fs.S3FileSystem() + self.s3_path_format = os.getenv("S3_PATH_TEMPLATE") + self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE") + + async def resolve_lora(self, base_model_name, lora_name): + s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name) + local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name) + + # Download the LoRA from S3 to the local path + await self.s3._get( + s3_path, local_path, recursive=True, maxdepth=1 + ) + + lora_request = LoRARequest( + lora_name=lora_name, + lora_path=local_path, + lora_int_id=abs(hash(lora_name)) + ) + return lora_request + ``` + +2. Register LoRAResolver plugin. + + ```python + from vllm.lora.resolver import LoRAResolverRegistry + + s3_resolver = S3LoRAResolver() + LoRAResolverRegistry.register_resolver("s3_resolver", s3_resolver) + ``` + + For more details, refer to the [vLLM's Plugins System](../design/plugin_system.md). + ## New format for `--lora-modules` In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example: diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md index b703d019531..b4ac597f5a7 100644 --- a/docs/source/features/quantization/auto_awq.md +++ b/docs/source/features/quantization/auto_awq.md @@ -6,13 +6,13 @@ To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github Quantization reduces the model's precision from BF16/FP16 to INT4 which effectively reduces the total model memory footprint. The main benefits are lower latency and memory usage. -You can quantize your own models by installing AutoAWQ or picking one of the [6500+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq). +You can quantize your own models by installing AutoAWQ or picking one of the [6500+ models on Huggingface](https://huggingface.co/models?search=awq). ```console pip install autoawq ``` -After installing AutoAWQ, you are ready to quantize a model. Please refer to the `AutoAWQ documentation `_ for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: +After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: ```python from awq import AutoAWQForCausalLM diff --git a/docs/source/features/quantization/bitblas.md b/docs/source/features/quantization/bitblas.md new file mode 100644 index 00000000000..d0b2bf858c9 --- /dev/null +++ b/docs/source/features/quantization/bitblas.md @@ -0,0 +1,48 @@ +(bitblas)= + +# BitBLAS + +vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more efficient and flexible model inference. Compared to other quantization frameworks, BitBLAS provides more precision combinations. + +:::{note} +Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`). +Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper. 
+For details see [supported hardware](https://docs.vllm.ai/en/latest/features/quantization/supported_hardware.html). +::: + +Below are the steps to utilize BitBLAS with vLLM. + +```console +pip install bitblas>=0.1.0 +``` + +vLLM reads the model's config file and supports pre-quantized checkpoints. + +You can find pre-quantized models on: + +- [Hugging Face (BitBLAS)](https://huggingface.co/models?search=bitblas) +- [Hugging Face (GPTQ)](https://huggingface.co/models?search=gptq) + +Usually, these repositories have a `quantize_config.json` file that includes a `quantization_config` section. + +## Read bitblas format checkpoint + +```python +from vllm import LLM +import torch + +# "hxbgsyxh/llama-13b-4bit-g-1-bitblas" is a pre-quantized checkpoint. +model_id = "hxbgsyxh/llama-13b-4bit-g-1-bitblas" +llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, quantization="bitblas") +``` + +## Read gptq format checkpoint + +```python +from vllm import LLM +import torch + +# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint. +model_id = "hxbgsyxh/llama-13b-4bit-g-1" +llm = LLM(model=model_id, dtype=torch.float16, trust_remote_code=True, quantization="bitblas", max_model_len=1024) +``` diff --git a/docs/source/features/quantization/bnb.md b/docs/source/features/quantization/bnb.md index e356b99d85c..1843a33a3df 100644 --- a/docs/source/features/quantization/bnb.md +++ b/docs/source/features/quantization/bnb.md @@ -14,7 +14,7 @@ pip install bitsandbytes>=0.45.3 vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. -You can find bitsandbytes quantized models on . +You can find bitsandbytes quantized models on . And usually, these repositories have a config.json file that includes a quantization_config section. ## Read quantized checkpoint diff --git a/docs/source/features/quantization/fp8.md b/docs/source/features/quantization/fp8.md index a62e0124b77..95e105357bd 100644 --- a/docs/source/features/quantization/fp8.md +++ b/docs/source/features/quantization/fp8.md @@ -30,6 +30,7 @@ from vllm import LLM model = LLM("facebook/opt-125m", quantization="fp8") # INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB result = model.generate("Hello, my name is") +print(result[0].outputs[0].text) ``` :::{warning} @@ -44,6 +45,12 @@ To produce performant FP8 quantized models with vLLM, you'll need to install the pip install llmcompressor ``` +Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: + +```console +pip install vllm lm-eval==0.4.4 +``` + ## Quantization Process The quantization process involves three main steps: @@ -86,7 +93,7 @@ recipe = QuantizationModifier( # Apply the quantization algorithm. oneshot(model=model, recipe=recipe) -# Save the model. +# Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" model.save_pretrained(SAVE_DIR) tokenizer.save_pretrained(SAVE_DIR) @@ -94,18 +101,13 @@ tokenizer.save_pretrained(SAVE_DIR) ### 3. 
Evaluating Accuracy -Install `vllm` and `lm-evaluation-harness`: - -```console -pip install vllm lm-eval==0.4.4 -``` - Load and run the model in `vllm`: ```python from vllm import LLM model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") -model.generate("Hello my name is") +result = model.generate("Hello my name is") +print(result[0].outputs[0].text) ``` Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`): @@ -188,4 +190,5 @@ from vllm import LLM model = LLM(model="Meta-Llama-3-8B-Instruct-FP8/") # INFO 06-10 21:15:41 model_runner.py:159] Loading model weights took 8.4596 GB result = model.generate("Hello, my name is") +print(result[0].outputs[0].text) ``` diff --git a/docs/source/features/quantization/gptqmodel.md b/docs/source/features/quantization/gptqmodel.md index 34adf6512b7..9771d5a4fe9 100644 --- a/docs/source/features/quantization/gptqmodel.md +++ b/docs/source/features/quantization/gptqmodel.md @@ -16,12 +16,16 @@ GPTQModel is one of the few quantization toolkits in the world that allows `Dyna is fully integrated into vLLM and backed up by support from the ModelCloud.AI team. Please refer to [GPTQModel readme](https://github.com/ModelCloud/GPTQModel?tab=readme-ov-file#dynamic-quantization-per-module-quantizeconfig-override) for more details on this and other advanced features. -You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?sort=trending&search=gptq). +## Installation + +You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?search=gptq). ```console pip install -U gptqmodel --no-build-isolation -v ``` +## Quantizing a model + After installing GPTQModel, you are ready to quantize a model. Please refer to the [GPTQModel readme](https://github.com/ModelCloud/GPTQModel/?tab=readme-ov-file#quantization) for further details. Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`: @@ -49,12 +53,16 @@ model.quantize(calibration_dataset, batch_size=2) model.save(quant_path) ``` +## Running a quantized model with vLLM + To run an GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command: ```console -python examples/offline_inference/llm_engine_example.py --model DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2 +python examples/offline_inference/llm_engine_example.py --model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2 ``` +## Using GPTQModel with vLLM's Python API + GPTQModel quantized models are also supported directly through the LLM entrypoint: ```python @@ -67,17 +75,22 @@ prompts = [ "The capital of France is", "The future of AI is", ] + # Create a sampling params object. sampling_params = SamplingParams(temperature=0.6, top_p=0.9) # Create an LLM. -llm = LLM(model="DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2") +llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2") + # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) + # Print the outputs. 
+print("-"*50) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-"*50) ``` diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md index 6f539f6e3f4..7ad46b7094e 100644 --- a/docs/source/features/quantization/index.md +++ b/docs/source/features/quantization/index.md @@ -11,11 +11,13 @@ Quantization trades off model precision for smaller memory footprint, allowing l supported_hardware auto_awq bnb +bitblas gguf gptqmodel int4 int8 fp8 +modelopt quark quantized_kvcache torchao diff --git a/docs/source/features/quantization/int4.md b/docs/source/features/quantization/int4.md index f8939e5bf01..be48788a4ef 100644 --- a/docs/source/features/quantization/int4.md +++ b/docs/source/features/quantization/int4.md @@ -18,6 +18,12 @@ To use INT4 quantization with vLLM, you'll need to install the [llm-compressor]( pip install llmcompressor ``` +Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: + +```console +pip install vllm lm-eval==0.4.4 +``` + ## Quantization Process The quantization process involves four main steps: @@ -87,7 +93,7 @@ oneshot( num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save the compressed model +# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128 SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md index b381f34bccd..d6ddca18e26 100644 --- a/docs/source/features/quantization/int8.md +++ b/docs/source/features/quantization/int8.md @@ -19,6 +19,12 @@ To use INT8 quantization with vLLM, you'll need to install the [llm-compressor]( pip install llmcompressor ``` +Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: + +```console +pip install vllm lm-eval==0.4.4 +``` + ## Quantization Process The quantization process involves four main steps: @@ -91,7 +97,7 @@ oneshot( num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save the compressed model +# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/docs/source/features/quantization/modelopt.md b/docs/source/features/quantization/modelopt.md new file mode 100644 index 00000000000..001d18657da --- /dev/null +++ b/docs/source/features/quantization/modelopt.md @@ -0,0 +1,78 @@ +# NVIDIA TensorRT Model Optimizer + +The [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is a library designed to optimize models for inference with NVIDIA GPUs. It includes tools for Post-Training Quantization (PTQ) and Quantization Aware Training (QAT) of Large Language Models (LLMs), Vision Language Models (VLMs), and diffusion models. + +We recommend installing the library with: + +```console +pip install nvidia-modelopt +``` + +## Quantizing HuggingFace Models with PTQ + +You can quantize HuggingFace models using the example scripts provided in the TensorRT Model Optimizer repository. The primary script for LLM PTQ is typically found within the `examples/llm_ptq` directory. 
+ +Below is an example showing how to quantize a model using modelopt's PTQ API: + +```python +import modelopt.torch.quantization as mtq +from transformers import AutoModelForCausalLM + +# Load the model from HuggingFace +model = AutoModelForCausalLM.from_pretrained("") + +# Select the quantization config, for example, FP8 +config = mtq.FP8_DEFAULT_CFG + +# Define a forward loop function for calibration +def forward_loop(model): + for data in calib_set: + model(data) + +# PTQ with in-place replacement of quantized modules +model = mtq.quantize(model, config, forward_loop) +``` + +After the model is quantized, you can export it to a quantized checkpoint using the export API: + +```python +import torch +from modelopt.torch.export import export_hf_checkpoint + +with torch.inference_mode(): + export_hf_checkpoint( + model, # The quantized model. + export_dir, # The directory where the exported files will be stored. + ) +``` + +The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM: + +```python +from vllm import LLM, SamplingParams + +def main(): + + model_id = "nvidia/Llama-3.1-8B-Instruct-FP8" + # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint + llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True) + + sampling_params = SamplingParams(temperature=0.8, top_p=0.9) + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + outputs = llm.generate(prompts, sampling_params) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +if __name__ == "__main__": + main() +``` diff --git a/docs/source/features/quantization/quantized_kvcache.md b/docs/source/features/quantization/quantized_kvcache.md index 9f36c2949e0..86e6354ec82 100644 --- a/docs/source/features/quantization/quantized_kvcache.md +++ b/docs/source/features/quantization/quantized_kvcache.md @@ -126,7 +126,7 @@ oneshot( num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save quantized model +# Save quantized model: Llama-3.1-8B-Instruct-FP8-KV SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/docs/source/features/quantization/quark.md b/docs/source/features/quantization/quark.md index 935ee37a815..955890dbc75 100644 --- a/docs/source/features/quantization/quark.md +++ b/docs/source/features/quantization/quark.md @@ -19,6 +19,12 @@ pip install amd-quark You can refer to [Quark installation guide](https://quark.docs.amd.com/latest/install.html) for more installation details. +Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: + +```console +pip install vllm lm-eval==0.4.4 +``` + ## Quantization Process After installing Quark, we will use an example to illustrate how to use Quark. 
@@ -150,6 +156,7 @@ LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"] export_config = ExporterConfig(json_export_config=JsonExporterConfig()) export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP +# Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant" exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR) with torch.no_grad(): diff --git a/docs/source/features/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md index 2cbe8779dd8..f8af1ba60b1 100644 --- a/docs/source/features/quantization/supported_hardware.md +++ b/docs/source/features/quantization/supported_hardware.md @@ -74,6 +74,17 @@ The table below shows the compatibility of various quantization implementations * ❌ * ❌ * ❌ +- * BitBLAS (GPTQ) + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ❌ + * ❌ + * ❌ + * ❌ + * ❌ - * AQLM * ✅︎ * ✅︎ @@ -118,7 +129,17 @@ The table below shows the compatibility of various quantization implementations * ❌ * ❌ * ❌ - +- * modelopt + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎︎ + * ❌ + * ❌ + * ❌ + * ❌ + * ❌ ::: - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. diff --git a/docs/source/features/quantization/torchao.md b/docs/source/features/quantization/torchao.md index 9a85f0bab9e..82100c6ddca 100644 --- a/docs/source/features/quantization/torchao.md +++ b/docs/source/features/quantization/torchao.md @@ -30,5 +30,4 @@ tokenizer.push_to_hub(hub_repo) quantized_model.push_to_hub(hub_repo, safe_serialization=False) ``` -Alternatively, you can use the TorchAO Quantization space for quantizing models with a simple UI. -See: https://huggingface.co/spaces/medmekk/TorchAO_Quantization +Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI. diff --git a/docs/source/features/reasoning_outputs.md b/docs/source/features/reasoning_outputs.md index 3a0be69f8e1..a079eb8b77e 100644 --- a/docs/source/features/reasoning_outputs.md +++ b/docs/source/features/reasoning_outputs.md @@ -15,16 +15,16 @@ vLLM currently supports the following reasoning models: | [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ | | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ | | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ | +| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ | - IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. ## Quickstart -To use reasoning models, you need to specify the `--enable-reasoning` and `--reasoning-parser` flags when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output. +To use reasoning models, you need to specify the `--reasoning-parser` flags when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output. 
```bash -vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ - --enable-reasoning --reasoning-parser deepseek_r1 +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1 ``` Next, make a request to the model that should return the reasoning content in the response. @@ -139,8 +139,7 @@ Remember to check whether the `reasoning_content` exists in the response before The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output. It is only supported in v0 engine now. ```bash -VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ - --enable-reasoning --reasoning-parser deepseek_r1 +VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1 ``` Please note that the `VLLM_USE_V1` environment variable must be set to `0` to use the v0 engine. @@ -315,9 +314,8 @@ class DeepSeekReasoner(Reasoner): The structured output engine like `xgrammar` will use `end_token_id` to check if the reasoning content is present in the model output and skip the structured output if it is the case. -Finally, you can enable reasoning for the model by using the `--enable-reasoning` and `--reasoning-parser` flags. +Finally, you can enable reasoning for the model by using the `--reasoning-parser` flags. ```bash -vllm serve \ - --enable-reasoning --reasoning-parser example +vllm serve --reasoning-parser example ``` diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md index de3c5bf5e7a..03119ec7441 100644 --- a/docs/source/features/structured_outputs.md +++ b/docs/source/features/structured_outputs.md @@ -2,8 +2,11 @@ # Structured Outputs -vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines), [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer), or [xgrammar](https://github.com/mlc-ai/xgrammar) as backends for the guided decoding. -This document shows you some examples of the different options that are available to generate structured outputs. +vLLM supports the generation of structured outputs using +[xgrammar](https://github.com/mlc-ai/xgrammar) or +[guidance](https://github.com/guidance-ai/llguidance) as backends. +This document shows you some examples of the different options that are +available to generate structured outputs. ## Online Serving (OpenAI API) @@ -15,10 +18,17 @@ The following parameters are supported, which must be added as extra parameters: - `guided_regex`: the output will follow the regex pattern. - `guided_json`: the output will follow the JSON schema. - `guided_grammar`: the output will follow the context free grammar. -- `guided_whitespace_pattern`: used to override the default whitespace pattern for guided json decoding. -- `guided_decoding_backend`: used to select the guided decoding backend to use. Additional backend-specific options can be supplied in a comma separated list following a colon after the backend name. For example `"xgrammar:no-fallback"` will not allow vLLM to fallback to a different backend on error. +- `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text. -You can see the complete list of supported parameters on the [OpenAI-Compatible Server](#openai-compatible-server)page. +You can see the complete list of supported parameters on the [OpenAI-Compatible Server](#openai-compatible-server) page. 
+ +Structured outputs are supported by default in the OpenAI-Compatible Server. You +may choose to specify the backend to use by setting the +`--guided-decoding-backend` flag to `vllm serve`. The default backend is `auto`, +which will try to choose an appropriate backend based on the details of the +request. You may also choose a specific backend, along with +some options. A full set of options is available in the `vllm serve --help` +text. Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one: @@ -50,7 +60,7 @@ completion = client.chat.completions.create( "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n", } ], - extra_body={"guided_regex": "\w+@\w+\.com\n", "stop": ["\n"]}, + extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]}, ) print(completion.choices[0].message.content) ``` @@ -96,26 +106,29 @@ print(completion.choices[0].message.content) ``` :::{tip} -While not strictly necessary, normally it´s better to indicate in the prompt that a JSON needs to be generated and which fields and how should the LLM fill them. -This can improve the results notably in most cases. +While not strictly necessary, normally it´s better to indicate in the prompt the +JSON schema and how the fields should be populated. This can improve the +results notably in most cases. ::: -Finally we have the `guided_grammar`, which probably is the most difficult one to use but it´s really powerful, as it allows us to define complete languages like SQL queries. -It works by using a context free EBNF grammar, which for example we can use to define a specific format of simplified SQL queries, like in the example below: +Finally we have the `guided_grammar` option, which is probably the most +difficult to use, but it´s really powerful. It allows us to define complete +languages like SQL queries. It works by using a context free EBNF grammar. +As an example, we can use to define a specific format of simplified SQL queries: ```python simplified_sql_grammar = """ - ?start: select_statement + root ::= select_statement - ?select_statement: "SELECT " column_list " FROM " table_name + select_statement ::= "SELECT " column " from " table " where " condition - ?column_list: column_name ("," column_name)* + column ::= "col_1 " | "col_2 " - ?table_name: identifier + table ::= "table_1 " | "table_2 " - ?column_name: identifier + condition ::= column "= " number - ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ + number ::= "1 " | "2 " """ completion = client.chat.completions.create( @@ -226,6 +239,8 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa Answer: x = -29/8 ``` +An example of using `structural_tag` can be found here: + ## Offline Inference Offline inference allows for the same types of guided decoding. @@ -236,11 +251,11 @@ The main available options inside `GuidedDecodingParams` are: - `regex` - `choice` - `grammar` -- `backend` -- `whitespace_pattern` +- `structural_tag` -These parameters can be used in the same way as the parameters from the Online Serving examples above. -One example for the usage of the `choices` parameter is shown below: +These parameters can be used in the same way as the parameters from the Online +Serving examples above. 
One example for the usage of the `choice` parameter is
+shown below:
 
 ```python
 from vllm import LLM, SamplingParams
diff --git a/docs/source/features/tool_calling.md b/docs/source/features/tool_calling.md
index 8b8bbd28d34..f98ec6108ce 100644
--- a/docs/source/features/tool_calling.md
+++ b/docs/source/features/tool_calling.md
@@ -152,12 +152,14 @@ Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_cha
 
 Supported models:
 
-* `meta-llama/Meta-Llama-3.1-8B-Instruct`
-* `meta-llama/Meta-Llama-3.1-70B-Instruct`
-* `meta-llama/Meta-Llama-3.1-405B-Instruct`
-* `meta-llama/Meta-Llama-3.1-405B-Instruct-FP8`
+All Llama 3.1, 3.2 and 4 models should be supported.
+
+* `meta-llama/Llama-3.1-*`
+* `meta-llama/Llama-3.2-*`
+* `meta-llama/Llama-4-*`
+
+The supported tool calling format is [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below.
 
-The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) in Llama-3.2 models, see the `pythonic` tool parser below.
 Other tool calling formats like the built in python tool calling or custom tool calling are not supported.
 
 Known issues:
@@ -166,10 +168,20 @@ Known issues:
 2. The model can generate parameters with a wrong format, such as generating
    an array serialized as string instead of an array.
 
-The `tool_chat_template_llama3_json.jinja` file contains the "official" Llama chat template, but tweaked so that
-it works better with vLLM.
+vLLM provides two JSON based chat templates for Llama 3.1 and 3.2:
+
+* `examples/tool_chat_template_llama3.1_json.jinja` - this is the "official" chat template for the Llama 3.1
+models, but tweaked so that it works better with vLLM.
+* `examples/tool_chat_template_llama3.2_json.jinja` - this extends the Llama 3.1 chat template by adding support for
+images.
+
+Recommended flags: `--tool-call-parser llama3_json --chat-template {see_above}`
+
+vLLM also provides a JSON based chat template for Llama 4:
+* `examples/tool_chat_template_llama4_json.jinja` - this is based on the "official" chat template for the Llama 4
+models, but tweaked so that it works better with vLLM.
 
-Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool_chat_template_llama3_json.jinja`
+For Llama 4, use `--tool-call-parser llama4_json --chat-template examples/tool_chat_template_llama4_json.jinja`.
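+
+As a quick sanity check, the sketch below shows what a request against such a server can look like with the OpenAI Python client. It is only a sketch: the tool definition, the model name, and the local URL are illustrative and not part of vLLM itself; the server is assumed to have been started with `--enable-auto-tool-choice` plus one of the parser/template combinations above.
+
+```python
+from openai import OpenAI
+
+# Assumes a running server, e.g.:
+#   vllm serve meta-llama/Llama-3.1-8B-Instruct --enable-auto-tool-choice \
+#       --tool-call-parser llama3_json --chat-template examples/tool_chat_template_llama3.1_json.jinja
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
+
+tools = [{
+    "type": "function",
+    "function": {
+        "name": "get_weather",  # illustrative tool, not provided by vLLM
+        "description": "Get the current weather for a city.",
+        "parameters": {
+            "type": "object",
+            "properties": {"city": {"type": "string"}},
+            "required": ["city"],
+        },
+    },
+}]
+
+response = client.chat.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct",
+    messages=[{"role": "user", "content": "What is the weather like in Berlin?"}],
+    tools=tools,
+    tool_choice="auto",
+)
+
+# The configured parser surfaces the model's call in the standard tool_calls field.
+print(response.choices[0].message.tool_calls)
+```
+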
#### IBM Granite diff --git a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md index e3046f35ee1..78938de317c 100644 --- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md @@ -13,11 +13,11 @@ There are no pre-built wheels or images for this device, so you must build vLLM - Intel Gaudi accelerator - Intel Gaudi software version 1.18.0 -Please follow the instructions provided in the [Gaudi Installation -Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) +Please follow the instructions provided in the +[Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) to set up the execution environment. To achieve the best performance, -please follow the methods outlined in the [Optimizing Training Platform -Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). +please follow the methods outlined in the +[Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). ## Configure a new environment @@ -32,15 +32,13 @@ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloade pip list | grep neural # verify that neural_compressor is installed ``` -Refer to [Intel Gaudi Software Stack -Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) +Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) for more details. ### Run Docker Image It is highly recommended to use the latest Docker image from Intel Gaudi -vault. Refer to the [Intel Gaudi -documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) +vault. Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) for more details. Use the following commands to run a Docker image: @@ -278,8 +276,9 @@ Lower value corresponds to less usable graph memory reserved for prefill stage, ::: User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented: -\- `max_bs` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode -\- `min_tokens` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), default strategy for prompt + +- `max_bs` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. 
`(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode +- `min_tokens` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), default strategy for prompt When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy. @@ -326,8 +325,7 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi - We recommend running inference on Gaudi 2 with `block_size` of 128 for BF16 data type. Using default values (16, 32) might lead to sub-optimal performance due to Matrix Multiplication Engine - under-utilization (see [Gaudi - Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). + under-utilization (see [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). - For max throughput on Llama 7B, we recommend running with batch size of 128 or 256 and max context length of 2048 with HPU Graphs enabled. If you encounter out-of-memory issues, see troubleshooting section. @@ -336,11 +334,11 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi **Diagnostic and profiling knobs:** -- `VLLM_PROFILER_ENABLED`: if `true`, high level profiler will be enabled. Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled by default. -- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true`, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside `PT_HPU_METRICS_GC_DETAILS=1`. Disabled by default. -- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true`, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default. -- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true`, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default. -- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default. +- `VLLM_PROFILER_ENABLED`: If `true`, enable the high level profiler. Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). `false` by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: If `true`, log graph compilations for each vLLM engine step when any occurs. Highly recommended to use with `PT_HPU_METRICS_GC_DETAILS=1`. `false` by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: If `true`, always log graph compilations for each vLLM engine step even if none occurred. `false` by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: If `true`, log CPU fallbacks for each vLLM engine step when any occurs. `false` by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, always log CPU fallbacks for each vLLM engine step even if none occurred. `false` by default. 
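+
+If you launch vLLM from a Python script rather than a shell, the same knobs can be set through `os.environ` before vLLM is imported, as in the minimal sketch below. The variable names are exactly the ones listed above; the model name is an arbitrary example, and exporting the variables in the shell before starting vLLM is equivalent.
+
+```python
+import os
+
+# Diagnostic knobs from the list above; values are plain strings.
+os.environ["VLLM_PROFILER_ENABLED"] = "true"
+os.environ["VLLM_HPU_LOG_STEP_GRAPH_COMPILATION"] = "true"
+os.environ["PT_HPU_METRICS_GC_DETAILS"] = "1"
+
+from vllm import LLM  # import after the knobs are set
+
+llm = LLM(model="meta-llama/Llama-2-7b-hf")  # arbitrary example model
+print(llm.generate("Hello, my name is")[0].outputs[0].text)
+```
+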
**Performance tuning knobs:**
 
@@ -381,7 +379,7 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi
 
 Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution:
 
-- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is default
+- `PT_HPU_LAZY_MODE`: if `0`, the PyTorch Eager backend for Gaudi will be used; if `1`, the PyTorch Lazy backend for Gaudi will be used. `1` is the default.
 - `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs
 
 ## Troubleshooting: tweaking HPU graphs
diff --git a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
index beb803cf059..4459cc61e1c 100644
--- a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
@@ -44,7 +44,7 @@ There are no pre-built wheels for this device, so you must either use the pre-bu
 
 You can provision Cloud TPUs using the [Cloud TPU API](https://cloud.google.com/tpu/docs/reference/rest)
 or the [queued resources](https://cloud.google.com/tpu/docs/queued-resources)
-API. This section shows how to create TPUs using the queued resource API. For
+API (preferred). This section shows how to create TPUs using the queued resource API. For
 more information about using the Cloud TPU API, see [Create a Cloud TPU using the Create Node API](https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#create-node-api).
 Queued resources enable you to request Cloud TPU resources in a queued manner. When you request queued resources, the request is added to a queue maintained by
@@ -97,10 +97,10 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
     `TPU regions and zones `_
 - * ACCELERATOR_TYPE
   * The TPU version you want to use. Specify the TPU version, for example
-    `v5litepod-4` specifies a v5e TPU with 4 cores. For more information,
-    see `TPU versions `_.
+    `v5litepod-4` specifies a v5e TPU with 4 cores, and `v6e-1` specifies a v6e TPU with 1 core. For more information,
+    see [TPU versions](https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#versions).
 - * RUNTIME_VERSION
-  * The TPU VM runtime version to use. For more information see `TPU VM images `_.
+  * The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s). For more information see [TPU VM images](https://cloud.google.com/tpu/docs/runtimes).
 - * SERVICE_ACCOUNT
   * The email address for your service account. You can find it in the IAM
     Cloud Console under *Service Accounts*. For example:
@@ -158,7 +158,7 @@ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
 Run the setup script:
 
 ```bash
-VLLM_TARGET_DEVICE="tpu" python setup.py develop
+VLLM_TARGET_DEVICE="tpu" python -m pip install -e .
 ```
 
 ## Set up using Docker
diff --git a/docs/source/getting_started/installation/cpu.md b/docs/source/getting_started/installation/cpu.md
index db22ef79c92..2c0ec60d710 100644
--- a/docs/source/getting_started/installation/cpu.md
+++ b/docs/source/getting_started/installation/cpu.md
@@ -272,7 +272,7 @@ $ python examples/offline_inference/basic/basic.py
 
 - Decouple the HTTP serving components from the inference components.
In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance. -- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.inc.md#non-uniform-memory-access-numa). For NUMA architecture, Tensor Parallel is a option for better performance. +- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.md#non-uniform-memory-access-numa). For NUMA architecture, Tensor Parallel is a option for better performance. - Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving: diff --git a/docs/source/getting_started/installation/cpu/build.inc.md b/docs/source/getting_started/installation/cpu/build.inc.md index 39d9dfbd2b2..f385f3d5b19 100644 --- a/docs/source/getting_started/installation/cpu/build.inc.md +++ b/docs/source/getting_started/installation/cpu/build.inc.md @@ -2,7 +2,7 @@ First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as ```console sudo apt-get update -y -sudo apt-get install -y gcc-12 g++-12 libnuma-dev +sudo apt-get install -y gcc-12 g++-12 libnuma-dev python3-dev sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 ``` @@ -26,3 +26,9 @@ Finally, build and install vLLM CPU backend: ```console VLLM_TARGET_DEVICE=cpu python setup.py install ``` + +If you want to develop vllm, install it in editable mode instead. + +```console +VLLM_TARGET_DEVICE=cpu python setup.py develop +``` diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md index d3e375aec10..06915f09dd5 100644 --- a/docs/source/getting_started/installation/gpu/cuda.inc.md +++ b/docs/source/getting_started/installation/gpu/cuda.inc.md @@ -1,6 +1,6 @@ # Installation -vLLM contains pre-compiled C++ and CUDA (12.1) binaries. +vLLM contains pre-compiled C++ and CUDA (12.6) binaries. ## Requirements @@ -23,12 +23,12 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I You can install vLLM using either `pip` or `uv pip`: ```console -# Install vLLM with CUDA 12.4. +# Install vLLM with CUDA 12.6. pip install vllm # If you are using pip. uv pip install vllm # If you are using uv. ``` -As of now, vLLM's binaries are compiled with CUDA 12.4 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.1, 11.8, and public PyTorch release versions: +As of now, vLLM's binaries are compiled with CUDA 12.6 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.8, 11.8, and public PyTorch release versions: ```console # Install vLLM with CUDA 11.8. 
@@ -46,7 +46,7 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe ##### Install the latest code using `pip` ```console -pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly +pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly ``` `--pre` is required for `pip` to consider pre-released versions. @@ -65,9 +65,11 @@ Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.p Another way to install the latest code is to use `uv`: ```console -uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly +uv pip install -U vllm --extra-index-url https://wheels.vllm.ai/nightly ``` +##### Install specific revisions using `uv` + If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL: ```console @@ -151,7 +153,7 @@ git clone https://github.com/vllm-project/vllm.git cd vllm python use_existing_torch.py pip install -r requirements/build.txt -pip install -e . --no-build-isolation +pip install --no-build-isolation -e . ``` ##### Use the local cutlass for compilation diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md index 21c8d7d01ad..dc74368fe2c 100644 --- a/docs/source/getting_started/installation/gpu/rocm.inc.md +++ b/docs/source/getting_started/installation/gpu/rocm.inc.md @@ -73,7 +73,22 @@ Currently, there are no pre-built ROCm wheels. You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) ::: -3. Build vLLM. For example, vLLM on ROCM 6.3 can be built with the following steps: +3. If you choose to build AITER yourself to use a certain branch or commit, you can build AITER using the following steps: + + ```console + python3 -m pip uninstall -y aiter + git clone --recursive https://github.com/ROCm/aiter.git + cd aiter + git checkout $AITER_BRANCH_OR_COMMIT + git submodule sync; git submodule update --init --recursive + python3 setup.py develop + ``` + + :::{note} + You will need to config the `$AITER_BRANCH_OR_COMMIT` for your purpose. + ::: + +4. Build vLLM. For example, vLLM on ROCM 6.3 can be built with the following steps: ```bash $ pip install --upgrade pip diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/source/getting_started/installation/gpu/xpu.inc.md index c41905f250f..4ab41a21c2a 100644 --- a/docs/source/getting_started/installation/gpu/xpu.inc.md +++ b/docs/source/getting_started/installation/gpu/xpu.inc.md @@ -23,6 +23,8 @@ Currently, there are no pre-built XPU wheels. - Second, install Python packages for vLLM XPU backend building: ```console +git clone https://github.com/vllm-project/vllm.git +cd vllm pip install --upgrade pip pip install -v -r requirements/xpu.txt ``` @@ -33,13 +35,6 @@ pip install -v -r requirements/xpu.txt VLLM_TARGET_DEVICE=xpu python setup.py install ``` -- Finally, due to a known issue of conflict dependency(oneapi related) in torch-xpu 2.6 and ipex-xpu 2.6, we install ipex here. This will be fixed in the ipex-xpu 2.7. - -```console -pip install intel-extension-for-pytorch==2.6.10+xpu \ - --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ -``` - :::{note} - FP16 is the default data type in the current XPU backend. The BF16 data type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet. 
@@ -79,5 +74,3 @@ python -m vllm.entrypoints.openai.api_server \ ``` By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. - -There are some new features coming with ipex-xpu 2.6, e.g. **chunked prefill**, **V1 engine support**, **lora**, **MoE**, etc. diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md index 87fa442e9a4..a4744827f22 100644 --- a/docs/source/getting_started/troubleshooting.md +++ b/docs/source/getting_started/troubleshooting.md @@ -24,7 +24,7 @@ To isolate the model downloading and loading issue, you can use the `--load-form ## Out of memory -If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. Consider [using tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. +If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. Consider adopting [these options](#reducing-memory-usage) to reduce the memory consumption. ## Generation quality changed diff --git a/docs/source/getting_started/v1_user_guide.md b/docs/source/getting_started/v1_user_guide.md index a87484c3bb0..de90b8a7851 100644 --- a/docs/source/getting_started/v1_user_guide.md +++ b/docs/source/getting_started/v1_user_guide.md @@ -44,8 +44,8 @@ This living user guide outlines a few known **important changes and limitations* |-----------------|-----------------------------------------------------------------------------------| | **Prefix Caching** | 🚀 Optimized | | **Chunked Prefill** | 🚀 Optimized | +| **LoRA** | 🚀 Optimized | | **Logprobs Calculation** | 🟢 Functional | -| **LoRA** | 🟢 Functional ([PR #13096](https://github.com/vllm-project/vllm/pull/13096))| | **Multimodal Models** | 🟢 Functional | | **FP8 KV Cache** | 🟢 Functional on Hopper devices ([PR #15191](https://github.com/vllm-project/vllm/pull/15191))| | **Spec Decode** | 🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))| @@ -121,11 +121,6 @@ Although we have re-implemented and partially optimized many features and models These features are already supported in vLLM V1, but their optimization is still in progress. -- **LoRA**: LoRA is functionally working on vLLM V1 but its performance is - inferior to that of V0. The team is actively working on improving its - performance -(e.g., see [PR #13096](https://github.com/vllm-project/vllm/pull/13096)). - - **Spec Decode**: Currently, only ngram-based spec decode is supported in V1. There will be follow-up work to support other types of spec decode (e.g., see [PR #13933](https://github.com/vllm-project/vllm/pull/13933)). We will prioritize the support for Eagle, MTP compared to draft model based spec decode. 
diff --git a/docs/source/index.md b/docs/source/index.md index 28dc0f67d77..bbff7361f75 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -132,6 +132,7 @@ serving/integrations/index :caption: Deployment :maxdepth: 1 +deployment/security deployment/docker deployment/k8s deployment/nginx @@ -180,6 +181,7 @@ design/v1/metrics :maxdepth: 2 contributing/overview +contributing/deprecation_policy contributing/profiling/profiling_index contributing/dockerfile/dockerfile contributing/model/index @@ -192,11 +194,8 @@ contributing/vulnerability_management :caption: API Reference :maxdepth: 2 -api/offline_inference/index -api/engine/index -api/inference_params -api/multimodal/index -api/model/index +api/summary +api/vllm/vllm ::: % Latest news and acknowledgements diff --git a/docs/source/models/extensions/fastsafetensor.md b/docs/source/models/extensions/fastsafetensor.md index 66cd710c97e..531d5869001 100644 --- a/docs/source/models/extensions/fastsafetensor.md +++ b/docs/source/models/extensions/fastsafetensor.md @@ -1,5 +1,5 @@ Loading Model weights with fastsafetensors =================================================================== -Using fastsafetensor library enables loading model weights to GPU memory by leveraging GPU direct storage. See https://github.com/foundation-model-stack/fastsafetensors for more details. +Using fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details. For enabling this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true`` diff --git a/docs/source/models/extensions/runai_model_streamer.md b/docs/source/models/extensions/runai_model_streamer.md index 99c37876a01..e0daa6f86dd 100644 --- a/docs/source/models/extensions/runai_model_streamer.md +++ b/docs/source/models/extensions/runai_model_streamer.md @@ -51,3 +51,29 @@ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer - :::{note} For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md). ::: + +## Sharded Model Loading + +vLLM also supports loading sharded models using Run:ai Model Streamer. This is particularly useful for large models that are split across multiple files. To use this feature, use the `--load-format runai_streamer_sharded` flag: + +```console +vllm serve /path/to/sharded/model --load-format runai_streamer_sharded +``` + +The sharded loader expects model files to follow the same naming pattern as the regular sharded state loader: `model-rank-{rank}-part-{part}.safetensors`. You can customize this pattern using the `pattern` parameter in `--model-loader-extra-config`: + +```console +vllm serve /path/to/sharded/model --load-format runai_streamer_sharded --model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}' +``` + +To create sharded model files, you can use the script provided in . This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader. + +The sharded loader supports all the same tunable parameters as the regular Run:ai Model Streamer, including `concurrency` and `memory_limit`. 
These can be configured in the same way: + +```console +vllm serve /path/to/sharded/model --load-format runai_streamer_sharded --model-loader-extra-config '{"concurrency":16, "memory_limit":5368709120}' +``` + +:::{note} +The sharded loader is particularly efficient for tensor or pipeline parallel models where each worker only needs to read its own shard rather than the entire checkpoint. +::: diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index 63fc53b0e7c..dd765e4a976 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -14,7 +14,7 @@ Usually, this is automatically inferred so you don't have to specify it. ## Offline Inference The {class}`~vllm.LLM` class provides various methods for offline inference. -See [Engine Arguments](#engine-args) for a list of options when initializing the model. +See for a list of options when initializing the model. ### `LLM.generate` @@ -59,7 +59,7 @@ A code example can be found here: for a list of options when initializing the model. ### `LLM.encode` @@ -141,3 +141,77 @@ Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints tha - [Pooling API](#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models. - [Embeddings API](#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](#multimodal-inputs) for embedding models. - [Score API](#score-api) is similar to `LLM.score` for cross-encoder models. + +## Matryoshka Embeddings + +[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows user to trade off between performance and cost. + +:::{warning} +Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings. + +For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model will result in the following error. + +```json +{"object":"error","message":"Model \"BAAI/bge-m3\" does not support matryoshka representation, changing output dimensions will lead to poor results.","type":"BadRequestError","param":null,"code":400} +``` + +::: + +### Manually enable Matryoshka Embeddings + +There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json,` it is allowed to change the output to arbitrary dimensions. Using `matryoshka_dimensions` can control the allowed output dimensions. + +For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}`, `hf_overrides={"matryoshka_dimensions": []}` (offline) or `--hf_overrides '{"is_matryoshka": true}'`, `--hf_overrides '{"matryoshka_dimensions": []}'`(online). + +Here is an example to serve a model with Matryoshka Embeddings enabled. + +```text +vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"matryoshka_dimensions":[256]}' +``` + +### Offline Inference + +You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter in {class}`~vllm.PoolingParams`. 
+ +```python +from vllm import LLM, PoolingParams + +model = LLM(model="jinaai/jina-embeddings-v3", + task="embed", + trust_remote_code=True) +outputs = model.embed(["Follow the white rabbit."], + pooling_params=PoolingParams(dimensions=32)) +print(outputs[0].outputs) +``` + +A code example can be found here: + +### Online Inference + +Use the following command to start vllm server. + +```text +vllm serve jinaai/jina-embeddings-v3 --trust-remote-code +``` + +You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter. + +```text +curl http://127.0.0.1:8000/v1/embeddings \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "input": "Follow the white rabbit.", + "model": "jinaai/jina-embeddings-v3", + "encoding_format": "float", + "dimensions": 32 + }' +``` + +Expected output: + +```json +{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}} +``` + +A openai client example can be found here: diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index b6fef2f43b8..831f9a86d1d 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -40,29 +40,37 @@ You can force the use of `TransformersForCausalLM` by setting `model_impl="trans vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM. ::: -#### Supported features +#### Custom models -The Transformers modeling backend explicitly supports the following features: +If a model is neither supported natively by vLLM or Transformers, it can still be used in vLLM! -- (except GGUF) -- -- +For a model to be compatible with the Transformers backend for vLLM it must: -#### Remote Code +- be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)): + * The model directory must have the correct structure (e.g. `config.json` is present). + * `config.json` must contain `auto_map.AutoModel`. +- be a Transformers backend for vLLM compatible model (see ): + * Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`). -If your model is neither supported natively by vLLM or Transformers, you can still run it in vLLM! +If the compatible model is: -Simply set `trust_remote_code=True` and vLLM will run any model on the Model Hub that is compatible with Transformers. -Provided that the model writer implements their model in a compatible way, this means that you can run new models before they are officially supported in Transformers or vLLM! +- on the Hugging Face Model Hub, simply set `trust_remote_code=True` for or `--trust-remode-code` for the . +- in a local directory, simply pass directory path to `model=` for or `vllm serve ` for the . 
-```python -from vllm import LLM -llm = LLM(model=..., task="generate", trust_remote_code=True) # Name or path of your model -llm.apply_model(lambda model: print(model.__class__)) -``` +This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM! + +(writing-custom-models)= + +#### Writing custom models + +This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)). To make your model compatible with the Transformers backend, it needs: +1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`. +2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention. +3. `MyModel` must contain `_supports_attention_backend = True`. + ```{code-block} python :caption: modeling_my_model.py @@ -71,7 +79,7 @@ from torch import nn class MyAttention(nn.Module): - def forward(self, hidden_states, **kwargs): # <- kwargs are required + def forward(self, hidden_states, **kwargs): ... attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -87,11 +95,11 @@ class MyModel(PreTrainedModel): _supports_attention_backend = True ``` -Here is what happens in the background: +Here is what happens in the background when this model is loaded: -1. The config is loaded -2. `MyModel` Python class is loaded from the `auto_map`, and we check that the model `_supports_attention_backend`. -3. The `TransformersForCausalLM` backend is used. See , which leverage `self.config._attn_implementation = "vllm"`, thus the need to use `ALL_ATTENTION_FUNCTION`. +1. The config is loaded. +2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`. +3. `MyModel` is loaded into `TransformersForCausalLM` (see ) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. That's it! @@ -129,7 +137,7 @@ class MyConfig(PretrainedConfig): ### Hugging Face Hub -By default, vLLM loads models from [Hugging Face (HF) Hub](https://huggingface.co/models). +By default, vLLM loads models from [Hugging Face (HF) Hub](https://huggingface.co/models). To change the download path for models, you can set the `HF_HOME` environment variable; for more details, refer to [their official documentation](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhome). To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository. If the `"architectures"` field contains a model architecture listed below, then it should be natively supported. @@ -213,6 +221,16 @@ output = llm.encode("Hello, my name is") print(output) ``` +(feature-status-legend)= + +## Feature Status Legend + +- ✅︎ indicates that the feature is supported for the model. + +- 🚧 indicates that the feature is planned but not yet supported for the model. + +- ⚠️ indicates that the feature is available but may have known issues or limitations. 
+ (supported-text-models)= ## List of Text-only Language Models @@ -314,7 +332,7 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ - * `GemmaForCausalLM` * Gemma - * `google/gemma-2b`, `google/gemma-7b`, etc. + * `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. * ✅︎ * ✅︎ - * `Gemma2ForCausalLM` @@ -334,7 +352,7 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ - * `Glm4ForCausalLM` * GLM-4-0414 - * `THUDM/GLM-4-32B-Chat-0414`, etc. + * `THUDM/GLM-4-32B-0414`, etc. * ✅︎ * ✅︎ - * `GPT2LMHeadModel` @@ -497,6 +515,11 @@ See [this page](#generative-models) for more information on how to use generativ * `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. * * ✅︎ +- * `Plamo2ForCausalLM` + * PLaMo2 + * `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. + * + * - * `QWenLMHeadModel` * Qwen * `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. @@ -519,8 +542,8 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ - * `Qwen3MoeForCausalLM` * Qwen3MoE - * `Qwen/Qwen3-MoE-15B-A2B`, etc. - * ✅︎ + * `Qwen/Qwen3-30B-A3B`, etc. + * * ✅︎ - * `StableLmForCausalLM` * StableLM @@ -735,6 +758,11 @@ If your model is not in the above list, we will try to automatically convert the * `BAAI/bge-reranker-v2-m3`, etc. * * +- * `ModernBertForSequenceClassification` + * ModernBert-based + * `Alibaba-NLP/gte-reranker-modernbert-base`, etc. + * + * ::: (supported-mm-models)= @@ -765,6 +793,8 @@ or `--limit-mm-per-prompt` (online serving). For example, to enable passing up t Offline inference: ```python +from vllm import LLM + llm = LLM( model="Qwen/Qwen2-VL-7B-Instruct", limit_mm_per_prompt={"image": 4}, @@ -774,7 +804,7 @@ llm = LLM( Online serving: ```bash -vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 +vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt '{"image":4}' ``` **This is no longer required if you are using vLLM V1.** @@ -865,6 +895,13 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ * ✅︎ * ✅︎ +- * `GraniteSpeechForConditionalGeneration` + * Granite Speech + * T + A + * `ibm-granite/granite-speech-3.3-8b` + * ✅︎ + * ✅︎ + * ✅︎ - * `H2OVLChatModel` * H2OVL * T + IE+ @@ -942,11 +979,18 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ * ✅︎ * ✅︎ +- * `MiniMaxVL01ForConditionalGeneration` + * MiniMax-VL + * T + IE+ + * `MiniMaxAI/MiniMax-VL-01`, etc. + * + * ✅︎ + * ✅︎ - * `Mistral3ForConditionalGeneration` * Mistral3 * T + I+ * `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. - * + * ✅︎ * ✅︎ * ✅︎ - * `MllamaForConditionalGeneration` @@ -970,6 +1014,13 @@ See [this page](#generative-models) for more information on how to use generativ * * ✅︎ * ✅︎ +- * `Ovis2ForConditionalGeneration`^ + * Ovis2 + * T + I+ + * `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis2-2B`, etc. + * + * + * ✅︎ - * `PaliGemmaForConditionalGeneration` * PaliGemma, PaliGemma 2 * T + IE @@ -990,7 +1041,7 @@ See [this page](#generative-models) for more information on how to use generativ * `microsoft/Phi-4-multimodal-instruct`, etc. 
* ✅︎ * - * + * ✅︎ - * `PixtralForConditionalGeneration` * Pixtral * T + I+ @@ -1026,6 +1077,13 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ * ✅︎ * ✅︎ +- * `Qwen2_5OmniThinkerForConditionalGeneration` + * Qwen2.5-Omni + * T + IE+ + VE+ + A+ + * `Qwen/Qwen2.5-Omni-7B` + * + * ✅︎ + * ✅︎\* - * `SkyworkR1VChatModel` * Skywork-R1V-38B * T + I @@ -1057,7 +1115,7 @@ See [this page](#generative-models) for more information on how to use generativ :::{important} Pan-and-scan image pre-processing is currently supported on V0 (but not V1). -You can enable it by passing `--mm-processor-kwargs '{"do_pan_and_scan": True}'`. +You can enable it by passing `--mm-processor-kwargs '{"do_pan_and_scan": true}'`. ::: :::{warning} @@ -1072,7 +1130,7 @@ V0 correctly implements the model's attention pattern: V1 currently uses a simplified attention pattern: - Uses causal attention for all tokens, including image tokens -- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": True}` +- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}` - Will be updated in the future to support the correct behavior This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends. @@ -1086,6 +1144,36 @@ This limitation exists because the model's mixed attention pattern (bidirectiona To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. ::: +:::{warning} +The output quality of `AllenAI/Molmo-7B-D-0924` (especially in object localization tasks) has deteriorated in recent updates. + +For the best results, we recommend using the following dependency versions (tested on A10 and L40): + +```text +# Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40) +torch==2.5.1 +torchvision==0.20.1 +transformers==4.48.1 +tokenizers==0.21.0 +tiktoken==0.7.0 +vllm==0.7.0 + +# Optional but recommended for improved performance and stability +triton==3.1.0 +xformers==0.0.28.post3 +uvloop==0.21.0 +protobuf==5.29.3 +openai==1.60.2 +opencv-python-headless==4.11.0.86 +pillow==10.4.0 + +# Installed FlashAttention (for float16 only) +flash-attn>=2.5.6 # Not used in float32, but should be documented +``` + +**Note:** Make sure you understand the security implications of using outdated packages. +::: + :::{note} The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now. For more details, please see: @@ -1095,6 +1183,14 @@ For more details, please see: Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1. ::: +:::{note} +To use Qwen2.5-Omni, you have to install Hugging Face Transformers library from source via +`pip install git+https://github.com/huggingface/transformers.git`. + +Read audio from video pre-processing is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1. +`--mm-processor-kwargs '{"use_audio_in_video": true}'`. +::: + ### Pooling Models See [this page](pooling-models) for more information on how to use pooling models. 
diff --git a/docs/source/performance/optimization.md b/docs/source/performance/optimization.md index ccbe8a36706..4160f078496 100644 --- a/docs/source/performance/optimization.md +++ b/docs/source/performance/optimization.md @@ -2,65 +2,188 @@ # Optimization and Tuning +This guide covers optimization strategies and performance tuning for vLLM V1. + ## Preemption Due to the auto-regressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests. -The vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes -available again. When this occurs, the following warning is printed: +In such cases, vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes +available again. When this occurs, you may see the following warning: ```text -WARNING 05-09 00:49:33 scheduler.py:1057 Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1 +WARNING 05-09 00:49:33 scheduler.py:1057 Sequence group 0 is preempted by PreemptionMode.RECOMPUTE mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1 ``` While this mechanism ensures system robustness, preemption and recomputation can adversely affect end-to-end latency. -If you frequently encounter preemptions from the vLLM engine, consider the following actions: +If you frequently encounter preemptions, consider the following actions: + +- Increase `gpu_memory_utilization`. vLLM pre-allocates GPU cache using this percentage of memory. By increasing utilization, you can provide more KV cache space. +- Decrease `max_num_seqs` or `max_num_batched_tokens`. This reduces the number of concurrent requests in a batch, thereby requiring less KV cache space. +- Increase `tensor_parallel_size`. This shards model weights across GPUs, allowing each GPU to have more memory available for KV cache. However, increasing this value may cause excessive synchronization overhead. +- Increase `pipeline_parallel_size`. This distributes model layers across GPUs, reducing the memory needed for model weights on each GPU, indirectly leaving more memory available for KV cache. However, increasing this value may cause latency penalties. -- Increase `gpu_memory_utilization`. The vLLM pre-allocates GPU cache by using gpu_memory_utilization% of memory. By increasing this utilization, you can provide more KV cache space. -- Decrease `max_num_seqs` or `max_num_batched_tokens`. This can reduce the number of concurrent requests in a batch, thereby requiring less KV cache space. -- Increase `tensor_parallel_size`. This approach shards model weights, so each GPU has more memory available for KV cache. -- Increase `pipeline_parallel_size`. This approach distributes model layers across GPUs, reducing the memory needed for model weights on each GPU, which indirectly leaves more memory available for KV cache. +You can monitor the number of preemption requests through Prometheus metrics exposed by vLLM. Additionally, you can log the cumulative number of preemption requests by setting `disable_log_stats=False`. 
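+
+As a concrete illustration, the sketch below applies a few of the knobs above when constructing an offline engine and keeps the preemption statistics visible in the logs. The model name and the specific values are examples only; the right settings depend on your hardware and workload.
+
+```python
+from vllm import LLM
+
+llm = LLM(
+    model="meta-llama/Llama-3.1-8B-Instruct",  # example model
+    gpu_memory_utilization=0.95,  # give the KV cache more headroom
+    max_num_seqs=128,             # lower this if preemptions persist
+    disable_log_stats=False,      # keep cumulative preemption counts in the logs
+)
+```
+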
-You can also monitor the number of preemption requests through Prometheus metrics exposed by the vLLM. Additionally, you can log the cumulative number of preemption requests by setting disable_log_stats=False. +In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as recomputation has lower overhead in the V1 architecture. (chunked-prefill)= ## Chunked Prefill -vLLM supports an experimental feature chunked prefill. Chunked prefill allows to chunk large prefills into smaller chunks and batch them together with decode requests. +Chunked prefill allows vLLM to process large prefills in smaller chunks and batch them together with decode requests. This feature helps improve both throughput and latency by better balancing compute-bound (prefill) and memory-bound (decode) operations. + +In vLLM V1, **chunked prefill is always enabled by default**. This is different from vLLM V0, where it was conditionally enabled based on model characteristics. + +With chunked prefill enabled, the scheduling policy prioritizes decode requests. It batches all pending decode requests before scheduling any prefill operations. When there are available tokens in the `max_num_batched_tokens` budget, it schedules pending prefills. If a pending prefill request cannot fit into `max_num_batched_tokens`, it automatically chunks it. + +This policy has two benefits: + +- It improves ITL and generation decode because decode requests are prioritized. +- It helps achieve better GPU utilization by locating compute-bound (prefill) and memory-bound (decode) requests to the same batch. -You can enable the feature by specifying `--enable-chunked-prefill` in the command line or setting `enable_chunked_prefill=True` in the LLM constructor. +### Performance Tuning with Chunked Prefill + +You can tune the performance by adjusting `max_num_batched_tokens`: + +- Smaller values (e.g., 2048) achieve better inter-token latency (ITL) because there are fewer prefills slowing down decodes. +- Higher values achieve better time to first token (TTFT) as you can process more prefill tokens in a batch. +- For optimal throughput, we recommend setting `max_num_batched_tokens > 8096` especially for smaller models on large GPUs. +- If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the V0 default scheduling policy (except that it still prioritizes decodes). ```python from vllm import LLM -llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True) -# Set max_num_batched_tokens to tune performance. -# NOTE: 2048 is the default max_num_batched_tokens for chunked prefill. -# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=2048) +# Set max_num_batched_tokens to tune performance +llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", max_num_batched_tokens=16384) ``` -By default, vLLM scheduler prioritizes prefills and doesn't batch prefill and decode to the same batch. -This policy optimizes the TTFT (time to the first token), but incurs slower ITL (inter token latency) and inefficient GPU utilization. +See related papers for more details ( or ). -Once chunked prefill is enabled, the policy is changed to prioritize decode requests. -It batches all pending decode requests to the batch before scheduling any prefill. -When there are available token_budget (`max_num_batched_tokens`), it schedules pending prefills. -If a last pending prefill request cannot fit into `max_num_batched_tokens`, it chunks it. 
+## Parallelism Strategies
 
-This policy has two benefits:
+vLLM supports multiple parallelism strategies that can be combined to optimize performance across different hardware configurations.
 
-- It improves ITL and generation decode because decode requests are prioritized.
-- It helps achieve better GPU utilization by locating compute-bound (prefill) and memory-bound (decode) requests to the same batch.
+### Tensor Parallelism (TP)
 
-You can tune the performance by changing `max_num_batched_tokens`. By default, it is set to 2048.
-Smaller `max_num_batched_tokens` achieves better ITL because there are fewer prefills interrupting decodes.
-Higher `max_num_batched_tokens` achieves better TTFT as you can put more prefill to the batch.
+Tensor parallelism shards model parameters across multiple GPUs within each model layer. This is the most common strategy for large model inference within a single node.
 
-- If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the default scheduling policy (except that it still prioritizes decodes).
-- Note that the default value (2048) of `max_num_batched_tokens` is optimized for ITL, and it may have lower throughput than the default scheduler.
+**When to use:**
 
-We recommend you set `max_num_batched_tokens > 2048` for throughput.
+- When the model is too large to fit on a single GPU
+- When you need to reduce memory pressure per GPU to allow more KV cache space for higher throughput
 
-See related papers for more details ( or ).
+```python
+from vllm import LLM
+
+# Split model across 4 GPUs
+llm = LLM(model="meta-llama/Llama-3.3-70B-Instruct", tensor_parallel_size=4)
+```
+
+For models that are too large to fit on a single GPU (like 70B parameter models), tensor parallelism is essential.
+
+### Pipeline Parallelism (PP)
+
+Pipeline parallelism distributes model layers across multiple GPUs. Each GPU processes different parts of the model in sequence.
+
+**When to use:**
+
+- When you've already maxed out efficient tensor parallelism but need to distribute the model further, or across nodes
+- For very deep and narrow models where layer distribution is more efficient than tensor sharding
+
+Pipeline parallelism can be combined with tensor parallelism for very large models:
+
+```python
+from vllm import LLM
+
+# Combine pipeline and tensor parallelism
+llm = LLM(
+    model="meta-llama/Llama-3.3-70B-Instruct",
+    tensor_parallel_size=4,
+    pipeline_parallel_size=2
+)
+```
+
+### Expert Parallelism (EP)
+
+Expert parallelism is a specialized form of parallelism for Mixture of Experts (MoE) models, where different expert networks are distributed across GPUs.
+
+**When to use:**
-
+- Specifically for MoE models (like DeepSeekV3, Qwen3MoE, Llama-4)
+- When you want to balance the expert computation load across GPUs
+
+Expert parallelism is enabled by setting `enable_expert_parallel=True`, which will use expert parallelism instead of tensor parallelism for MoE layers.
+It will use the same degree of parallelism as what you have set for tensor parallelism.
+
+### Data Parallelism (DP)
+
+Data parallelism replicates the entire model across multiple GPU sets and processes different batches of requests in parallel.
+ +**When to use:** + +- When you have enough GPUs to replicate the entire model +- When you need to scale throughput rather than model size +- In multi-user environments where isolation between request batches is beneficial + +Data parallelism can be combined with the other parallelism strategies and is set by `data_parallel_size=N`. +Note that MoE layers will be sharded according to the product of the tensor parallel size and data parallel size. + +## Reducing Memory Usage + +If you encounter out-of-memory issues, consider these strategies: + +### Context Length and Batch Size + +You can reduce memory usage by limiting the context length and batch size: + +```python +from vllm import LLM + +llm = LLM( + model="meta-llama/Llama-3.1-8B-Instruct", + max_model_len=2048, # Limit context window + max_num_seqs=4 # Limit batch size +) +``` + +### Adjust CUDA Graph Compilation + +CUDA graph compilation in V1 uses more memory than in V0. You can reduce memory usage by adjusting the compilation level: + +```python +from vllm import LLM +from vllm.config import CompilationConfig, CompilationLevel + +llm = LLM( + model="meta-llama/Llama-3.1-8B-Instruct", + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + cudagraph_capture_sizes=[1, 2, 4, 8] # Capture fewer batch sizes + ) +) +``` + +Or, if you are not concerned about latency or overall performance, disable CUDA graph compilation entirely with `enforce_eager=True`: + +```python +from vllm import LLM + +llm = LLM( + model="meta-llama/Llama-3.1-8B-Instruct", + enforce_eager=True # Disable CUDA graph compilation +) +``` + +### Multimodal Models + +For multi-modal models, you can reduce memory usage by limiting the number of images/videos per request: + +```python +from vllm import LLM + +# Accept up to 2 images per prompt +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + limit_mm_per_prompt={"image": 2} +) +``` diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index 591acc2c9b7..c285ef3e8e1 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -77,6 +77,10 @@ bash run_cluster.sh \ Then you get a ray cluster of **containers**. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. The IP addresses of each worker node should be specified in the `VLLM_HOST_IP` environment variable, and should be different for each worker node. Please check the network configuration of your cluster to make sure the nodes can communicate with each other through the specified IP addresses. +:::{warning} +It is considered best practice to set `VLLM_HOST_IP` to an address on a private network segment for the vLLM cluster. The traffic sent here is not encrypted. The endpoints are also exchanging data in a format that could be exploited to execute arbitrary code should a malicious party gain access to the network. Please ensure that this network is not reachable by any untrusted parties. +::: + :::{warning} Since this is a ray cluster of **containers**, all the following commands should be executed in the **containers**, otherwise you are executing the commands on the host machine, which is not connected to the ray cluster. To enter the container, you can use `docker exec -it node /bin/bash`. 
::: diff --git a/docs/source/serving/engine_args.md b/docs/source/serving/engine_args.md index e9943571a40..97ea01cd3b2 100644 --- a/docs/source/serving/engine_args.md +++ b/docs/source/serving/engine_args.md @@ -16,6 +16,7 @@ Below, you can find an explanation of every engine argument: :func: _engine_args_parser :prog: vllm serve :nodefaultconst: + :markdownhelp: ``` ## Async Engine Arguments @@ -29,4 +30,5 @@ Additional arguments are available to the asynchronous engine which is used for :func: _async_engine_args_parser :prog: vllm serve :nodefaultconst: + :markdownhelp: ``` diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md index f45d36c3cca..d9a093e8d14 100644 --- a/docs/source/serving/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -228,7 +228,7 @@ First, launch the OpenAI-compatible server: ```bash vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ - --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 + --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}' ``` Then, you can use the OpenAI client as follows: diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md index 85f2cafacdd..2621eda3254 100644 --- a/docs/source/serving/offline_inference.md +++ b/docs/source/serving/offline_inference.md @@ -25,13 +25,15 @@ The available APIs depend on the type of model that is being run: Please refer to the above pages for more details about each API. :::{seealso} -[API Reference](/api/offline_inference/index) +[API Reference](#offline-inference-api) ::: +(configuration-options)= + ## Configuration Options This section lists the most common options for running the vLLM engine. -For a full list, refer to the [Engine Arguments](#engine-args) page. +For a full list, refer to the page. (model-resolution)= @@ -59,6 +61,8 @@ model = LLM( Our [list of supported models](#supported-models) shows the model architectures that are recognized by vLLM. +(reducing-memory-usage)= + ### Reducing memory usage Large models might cause your machine to run out of memory (OOM). Here are some options that help alleviate this problem. @@ -81,6 +85,12 @@ before initializing vLLM. Otherwise, you may run into an error like `RuntimeErro To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable. ::: +:::{note} +With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). + +You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. +::: + #### Quantization Quantized models take less memory at the cost of lower precision. @@ -103,6 +113,39 @@ llm = LLM(model="adept/fuyu-8b", max_num_seqs=2) ``` +#### Reduce CUDA Graphs + +By default, we optimize model inference using CUDA graphs which take up extra memory in the GPU. + +:::{important} +CUDA graph capture takes up more memory in V1 than in V0. 
+::: + +You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage: + +```python +from vllm import LLM +from vllm.config import CompilationConfig, CompilationLevel + +llm = LLM( + model="meta-llama/Llama-3.1-8B-Instruct", + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + # By default, it goes up to max_num_seqs + cudagraph_capture_sizes=[1, 2, 4, 8, 16], + ), +) +``` + +You can disable graph capturing completely via the `enforce_eager` flag: + +```python +from vllm import LLM + +llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", + enforce_eager=True) +``` + #### Adjust cache size If you run out of CPU RAM, try the following options: @@ -110,16 +153,25 @@ If you run out of CPU RAM, try the following options: - (Multi-modal models only) you can set the size of multi-modal input cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB). - (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB). -#### Disable unused modalities +#### Multi-modal input limits -You can disable unused modalities (except for text) by setting its limit to zero. +You can allow a smaller number of multi-modal items per prompt to reduce the memory footprint of the model: + +```python +from vllm import LLM + +# Accept up to 3 images and 1 video per prompt +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + limit_mm_per_prompt={"image": 3, "video": 1}) +``` +You can go a step further and disable unused modalities completely by setting its limit to zero. For example, if your application only accepts image input, there is no need to allocate any memory for videos. ```python from vllm import LLM -# Accept images but not videos +# Accept any number of images but no videos llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", limit_mm_per_prompt={"video": 0}) ``` @@ -134,6 +186,29 @@ llm = LLM(model="google/gemma-3-27b-it", limit_mm_per_prompt={"image": 0}) ``` +#### Multi-modal processor arguments + +For certain models, you can adjust the multi-modal processor arguments to +reduce the size of the processed multi-modal inputs, which in turn saves memory. + +Here are some examples: + +```python +from vllm import LLM + +# Available for Qwen2-VL series models +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_kwargs={ + "max_pixels": 768 * 768, # Default is 1280 * 28 * 28 + }) + +# Available for InternVL series models +llm = LLM(model="OpenGVLab/InternVL2-2B", + mm_processor_kwargs={ + "max_dynamic_patch": 4, # Default is 12 + }) +``` + ### Performance optimization and tuning You can potentially improve the performance of vLLM by finetuning various options. diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 11ca571c684..34382c87a48 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -33,11 +33,13 @@ print(completion.choices[0].message) vLLM supports some parameters that are not supported by OpenAI, `top_k` for example. You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, i.e. `extra_body={"top_k": 50}` for `top_k`. ::: + :::{important} By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator. 
To disable this behavior, please pass `--generation-config vllm` when launching the server. ::: + ## Supported APIs We currently support the following OpenAI APIs: @@ -172,6 +174,12 @@ print(completion._request_id) The `vllm serve` command is used to launch the OpenAI-compatible server. +:::{tip} +The vast majority of command-line arguments are based on those for offline inference. + +See [here](configuration-options) for some common options. +::: + :::{argparse} :module: vllm.entrypoints.openai.cli_args :func: create_parser_for_docs @@ -394,9 +402,26 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai To use the Transcriptions API, please install with extra audio dependencies using `pip install vllm[audio]`. ::: +Code example: -Code example: +#### Extra Parameters + +The following [sampling parameters](#sampling-params) are supported. + +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-transcription-sampling-params +:end-before: end-transcription-sampling-params +::: + +The following extra parameters are supported: + +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-transcription-extra-params +:end-before: end-transcription-extra-params +::: (tokenizer-api)= diff --git a/examples/lmcache/README.md b/examples/lmcache/README.md new file mode 100644 index 00000000000..95a6bf995b2 --- /dev/null +++ b/examples/lmcache/README.md @@ -0,0 +1,56 @@ +# LMCache Examples + +This folder demonstrates how to use LMCache for disaggregated prefilling, CPU offloading and KV cache sharing. + +## 1. Disaggregated Prefill in vLLM v1 + +This example demonstrates how to run LMCache with disaggregated prefill using NIXL on a single node. + +### Prerequisites + +- Install [LMCache](https://github.com/LMCache/LMCache). You can simply run `pip install lmcache`. +- Install [NIXL](https://github.com/ai-dynamo/nixl). +- At least 2 GPUs +- Valid Hugging Face token (HF_TOKEN) for Llama 3.1 8B Instruct. + +### Usage + +Run +`cd disagg_prefill_lmcache_v1` +to get into `disagg_prefill_lmcache_v1` folder, and then run + +```bash +bash disagg_example_nixl.sh +``` + +to run disaggregated prefill and benchmark the performance. + +### Components + +#### Server Scripts +- `disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh` - Launches individual vLLM servers for prefill/decode, and also launches the proxy server. +- `disagg_prefill_lmcache_v1/disagg_proxy_server.py` - FastAPI proxy server that coordinates between prefiller and decoder +- `disagg_prefill_lmcache_v1/disagg_example_nixl.sh` - Main script to run the example + +#### Configuration +- `disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml` - Configuration for prefiller server +- `disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml` - Configuration for decoder server + +#### Log Files +The main script generates several log files: +- `prefiller.log` - Logs from the prefill server +- `decoder.log` - Logs from the decode server +- `proxy.log` - Logs from the proxy server + +## 2. CPU Offload Examples + +- `python cpu_offload_lmcache.py -v v0` - CPU offloading implementation for vLLM v0 +- `python cpu_offload_lmcache.py -v v1` - CPU offloading implementation for vLLM v1 + +## 3. KV Cache Sharing + +The `kv_cache_sharing_lmcache_v1.py` example demonstrates how to share KV caches between vLLM v1 instances. + +## 4. 
Disaggregated Prefill in vLLM v0 + +The `disaggregated_prefill_lmcache_v0.py` provides an example of how to run disaggregated prefill in vLLM v0. diff --git a/examples/lmcache/cpu_offload_lmcache.py b/examples/lmcache/cpu_offload_lmcache.py new file mode 100644 index 00000000000..bf191960b08 --- /dev/null +++ b/examples/lmcache/cpu_offload_lmcache.py @@ -0,0 +1,151 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This file demonstrates the example usage of cpu offloading +with LMCache in vLLM v1 or v0. + +Usage: + + Specify vLLM version + + -v v0 : Use LMCacheConnector + model = mistralai/Mistral-7B-Instruct-v0.2 + (Includes enable_chunked_prefill = True) + + -v v1 : Use LMCacheConnectorV1 (default) + model = meta-llama/Meta-Llama-3.1-8B-Instruct + (Without enable_chunked_prefill) + +Note that `lmcache` is needed to run this example. +Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1 +Learn more about LMCache environment setup, please refer to: +https://docs.lmcache.ai/getting_started/installation.html +""" +import argparse +import contextlib +import os +import time +from dataclasses import asdict + +from lmcache.experimental.cache_engine import LMCacheEngineBuilder +from lmcache.integration.vllm.utils import ENGINE_NAME + +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig +from vllm.engine.arg_utils import EngineArgs + + +def setup_environment_variables(): + # LMCache-related environment variables + # Use experimental features in LMCache + os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True" + # LMCache is set to use 256 tokens per chunk + os.environ["LMCACHE_CHUNK_SIZE"] = "256" + # Enable local CPU backend in LMCache + os.environ["LMCACHE_LOCAL_CPU"] = "True" + # Set local CPU memory limit to 5.0 GB + os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0" + + +@contextlib.contextmanager +def build_llm_with_lmcache(lmcache_connector: str, model: str, + vllm_version: str): + ktc = KVTransferConfig( + kv_connector=lmcache_connector, + kv_role="kv_both", + ) + # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB + # memory. Reduce the value if your GPU has less memory. + # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392). + if vllm_version == "v0": + llm_args = EngineArgs( + model=model, + kv_transfer_config=ktc, + max_model_len=8000, + gpu_memory_utilization=0.8, + enable_chunked_prefill=True, # Only in v0 + ) + else: + llm_args = EngineArgs( + model=model, + kv_transfer_config=ktc, + max_model_len=8000, + gpu_memory_utilization=0.8, + ) + + llm = LLM(**asdict(llm_args)) + try: + yield llm + finally: + # Clean up lmcache backend + LMCacheEngineBuilder.destroy(ENGINE_NAME) + + +def print_output( + llm: LLM, + prompt: list[str], + sampling_params: SamplingParams, + req_str: str, +): + # Should be able to see logs like the following: + # `LMCache INFO: Storing KV cache for 6006 out of 6006 tokens for request 0` + # This indicates that the KV cache has been stored in LMCache. 
+ start = time.time() + outputs = llm.generate(prompt, sampling_params) + print("-" * 50) + for output in outputs: + generated_text = output.outputs[0].text + print(f"Generated text: {generated_text!r}") + print(f"Generation took {time.time() - start:.2f} seconds, " + f"{req_str} request done.") + print("-" * 50) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("-v", + "--version", + choices=["v0", "v1"], + default="v1", + help="Specify vLLM version (default: v1)") + return parser.parse_args() + + +def main(): + args = parse_args() + + if args.version == "v0": + lmcache_connector = "LMCacheConnector" + model = "mistralai/Mistral-7B-Instruct-v0.2" + else: + lmcache_connector = "LMCacheConnectorV1" + model = "meta-llama/Meta-Llama-3.1-8B-Instruct" + + setup_environment_variables() + + with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm: + + # This example script runs two requests with a shared prefix. + # Define the shared prompt and specific prompts + shared_prompt = "Hello, how are you?" * 1000 + first_prompt = [ + shared_prompt + "Hello, my name is", + ] + second_prompt = [ + shared_prompt + "Tell me a very long story", + ] + + sampling_params = SamplingParams(temperature=0, + top_p=0.95, + max_tokens=10) + + # Print the first output + print_output(llm, first_prompt, sampling_params, "first") + + time.sleep(1) + + # print the second output + print_output(llm, second_prompt, sampling_params, "second") + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/disaggregated_prefill_lmcache.py b/examples/lmcache/disagg_prefill_lmcache_v0.py similarity index 100% rename from examples/offline_inference/disaggregated_prefill_lmcache.py rename to examples/lmcache/disagg_prefill_lmcache_v0.py diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml b/examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml new file mode 100644 index 00000000000..c3f5a0ae69c --- /dev/null +++ b/examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml @@ -0,0 +1,13 @@ +local_cpu: False +max_local_cpu_size: 0 +#local_disk: +max_local_disk_size: 0 +remote_serde: NULL + +enable_nixl: True +nixl_role: "receiver" +nixl_peer_host: "localhost" +nixl_peer_port: 55555 +nixl_buffer_size: 1073741824 # 1GB +nixl_buffer_device: "cuda" +nixl_enable_gc: True diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml b/examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml new file mode 100644 index 00000000000..8b0e82958a6 --- /dev/null +++ b/examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml @@ -0,0 +1,13 @@ +local_cpu: False +max_local_cpu_size: 0 +#local_disk: +max_local_disk_size: 0 +remote_serde: NULL + +enable_nixl: True +nixl_role: "sender" +nixl_peer_host: "localhost" +nixl_peer_port: 55555 +nixl_buffer_size: 1073741824 # 1GB +nixl_buffer_device: "cuda" +nixl_enable_gc: True diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh b/examples/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh new file mode 100644 index 00000000000..df8a4129350 --- /dev/null +++ b/examples/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh @@ -0,0 +1,136 @@ +#!/bin/bash + +echo "Warning: LMCache disaggregated prefill support for vLLM v1 is experimental and subject to change." 
+ + +PIDS=() + +# Switch to the directory of the current script +cd "$(dirname "${BASH_SOURCE[0]}")" + +check_hf_token() { + if [ -z "$HF_TOKEN" ]; then + echo "HF_TOKEN is not set. Please set it to your Hugging Face token." + exit 1 + fi + if [[ "$HF_TOKEN" != hf_* ]]; then + echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token." + exit 1 + fi + echo "HF_TOKEN is set and valid." +} + +check_num_gpus() { + # can you check if the number of GPUs are >=2 via nvidia-smi? + num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) + if [ "$num_gpus" -lt 2 ]; then + echo "You need at least 2 GPUs to run disaggregated prefill." + exit 1 + else + echo "Found $num_gpus GPUs." + fi +} + +ensure_python_library_installed() { + echo "Checking if $1 is installed..." + python -c "import $1" > /dev/null 2>&1 + if [ $? -ne 0 ]; then + if [ "$1" == "nixl" ]; then + echo "$1 is not installed. Please refer to https://github.com/ai-dynamo/nixl for installation." + else + echo "$1 is not installed. Please install it via pip install $1." + fi + exit 1 + else + echo "$1 is installed." + fi +} + +cleanup() { + echo "Stopping everything…" + trap - INT TERM # prevent re-entrancy + kill -- -$$ # negative PID == “this whole process-group” + wait # reap children so we don't leave zombies + exit 0 +} + +wait_for_server() { + local port=$1 + local timeout_seconds=1200 + local start_time=$(date +%s) + + echo "Waiting for server on port $port..." + + while true; do + if curl -s "localhost:${port}/v1/completions" > /dev/null; then + return 0 + fi + + local now=$(date +%s) + if (( now - start_time >= timeout_seconds )); then + echo "Timeout waiting for server" + return 1 + fi + + sleep 1 + done +} + + +main() { + check_hf_token + check_num_gpus + ensure_python_library_installed lmcache + ensure_python_library_installed nixl + ensure_python_library_installed pandas + ensure_python_library_installed datasets + ensure_python_library_installed vllm + + trap cleanup INT + trap cleanup USR1 + trap cleanup TERM + + echo "Launching prefiller, decoder and proxy..." + echo "Please check prefiller.log, decoder.log and proxy.log for logs." + + bash disagg_vllm_launcher.sh prefiller \ + > >(tee prefiller.log) 2>&1 & + prefiller_pid=$! + PIDS+=($prefiller_pid) + + bash disagg_vllm_launcher.sh decoder \ + > >(tee decoder.log) 2>&1 & + decoder_pid=$! + PIDS+=($decoder_pid) + + python3 disagg_proxy_server.py \ + --host localhost \ + --port 9000 \ + --prefiller-host localhost \ + --prefiller-port 8100 \ + --decoder-host localhost \ + --decoder-port 8200 \ + > >(tee proxy.log) 2>&1 & + proxy_pid=$! + PIDS+=($proxy_pid) + + wait_for_server 8100 + wait_for_server 8200 + wait_for_server 9000 + + echo "All servers are up. Starting benchmark..." + + # begin benchmark + cd ../../../benchmarks/ + python benchmark_serving.py --port 9000 --seed $(date +%s) \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --dataset-name random --random-input-len 7500 --random-output-len 200 \ + --num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log + + echo "Benchmarking done. Cleaning up..." 
+ + cleanup + +} + +main \ No newline at end of file diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py b/examples/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py new file mode 100644 index 00000000000..8db93bc8931 --- /dev/null +++ b/examples/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py @@ -0,0 +1,193 @@ +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import os +import time +from contextlib import asynccontextmanager + +import httpx +import numpy as np +from fastapi import FastAPI, Request +from fastapi.responses import StreamingResponse + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """ + Lifespan context manager to handle startup and shutdown events. + """ + # Startup: Initialize clients + prefiller_base_url = f'http://{global_args.prefiller_host}:{global_args.prefiller_port}/v1' + decoder_base_url = f'http://{global_args.decoder_host}:{global_args.decoder_port}/v1' + + app.state.prefill_client = httpx.AsyncClient(timeout=None, + base_url=prefiller_base_url) + app.state.decode_client = httpx.AsyncClient(timeout=None, + base_url=decoder_base_url) + + yield + + # Shutdown: Close clients + await app.state.prefill_client.aclose() + await app.state.decode_client.aclose() + + +# Update FastAPI app initialization to use lifespan +app = FastAPI(lifespan=lifespan) + + +class StatsCalculator: + + def __init__(self): + self._stats = [] + self._last_log_time = time.time() + + def add(self, value): + self._stats.append(value) + if time.time() - self._last_log_time > 5: + self._log_stats() + self._last_log_time = time.time() + + def _log_stats(self): + # Print average, median, and 99th percentile + np_arr = np.array(self._stats) + output_str = f"\nNum requests: {len(self._stats)}" + \ + "\nPrefill node TTFT stats:" + \ + f"\n - Average (ms): {np.mean(np_arr)}" + \ + f"\n - Median (ms): {np.median(np_arr)}" + \ + f"\n - 99th Percentile (ms): {np.percentile(np_arr, 99)}\n" + print("===============================", output_str, + "===============================") + + +stats_calculator = StatsCalculator() +counter = 0 + + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--prefiller-host", type=str, default="localhost") + parser.add_argument("--prefiller-port", type=int, default=8100) + parser.add_argument("--decoder-host", type=str, default="localhost") + parser.add_argument("--decoder-port", type=int, default=8200) + args = parser.parse_args() + return args + + +# Initialize variables to hold the persistent clients +app.state.prefill_client = None +app.state.decode_client = None + + +async def send_request_to_service(client: httpx.AsyncClient, endpoint: str, + req_data: dict): + """ + Send a request to a service using a persistent client. + """ + req_data = req_data.copy() + req_data['max_tokens'] = 1 + if 'max_completion_tokens' in req_data: + req_data['max_completion_tokens'] = 1 + + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + response = await client.post(endpoint, json=req_data, headers=headers) + response.raise_for_status() + return response + + +async def stream_service_response(client: httpx.AsyncClient, endpoint: str, + req_data: dict): + """ + Asynchronously stream the response from a service using a persistent client. 
+ """ + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + async with client.stream("POST", endpoint, json=req_data, + headers=headers) as response: + response.raise_for_status() + async for chunk in response.aiter_bytes(): + yield chunk + + +@app.post("/v1/completions") +async def handle_completions(request: Request): + global counter, stats_calculator + counter += 1 + + st = time.time() + try: + req_data = await request.json() + + # Send request to prefill service, ignore the response + await send_request_to_service(app.state.prefill_client, "/completions", + req_data) + + et = time.time() + stats_calculator.add(et - st) + + # Stream response from decode service + async def generate_stream(): + async for chunk in stream_service_response(app.state.decode_client, + "/completions", + req_data): + yield chunk + + return StreamingResponse(generate_stream(), + media_type="application/json") + + except Exception as e: + import sys + import traceback + exc_info = sys.exc_info() + print("Error occurred in disagg prefill proxy server" + " - completions endpoint") + print(e) + print("".join(traceback.format_exception(*exc_info))) + raise + + +@app.post("/v1/chat/completions") +async def handle_chat_completions(request: Request): + global counter, stats_calculator + counter += 1 + + st = time.time() + try: + req_data = await request.json() + + # Send request to prefill service, ignore the response + await send_request_to_service(app.state.prefill_client, + "/chat/completions", req_data) + + et = time.time() + stats_calculator.add(et - st) + + # Stream response from decode service + async def generate_stream(): + async for chunk in stream_service_response(app.state.decode_client, + "/chat/completions", + req_data): + yield chunk + + return StreamingResponse(generate_stream(), + media_type="application/json") + + except Exception as e: + import sys + import traceback + exc_info = sys.exc_info() + print("Error occurred in disagg prefill proxy server " + " - chat completions endpoint") + print(e) + print("".join(traceback.format_exception(*exc_info))) + raise + + +if __name__ == '__main__': + global global_args + global_args = parse_args() + + import uvicorn + uvicorn.run(app, host=global_args.host, port=global_args.port) diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh b/examples/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh new file mode 100644 index 00000000000..831ef0bb574 --- /dev/null +++ b/examples/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +if [[ $# -lt 1 ]]; then + echo "Usage: $0 [model]" + exit 1 +fi + +if [[ $# -eq 1 ]]; then + echo "Using default model: meta-llama/Llama-3.1-8B-Instruct" + MODEL="meta-llama/Llama-3.1-8B-Instruct" +else + echo "Using model: $2" + MODEL=$2 +fi + + +if [[ $1 == "prefiller" ]]; then + # Prefiller listens on port 8100 + prefill_config_file=$SCRIPT_DIR/configs/lmcache-prefiller-config.yaml + + UCX_TLS=cuda_ipc,cuda_copy,tcp \ + LMCACHE_CONFIG_FILE=$prefill_config_file \ + LMCACHE_USE_EXPERIMENTAL=True \ + VLLM_ENABLE_V1_MULTIPROCESSING=1 \ + VLLM_WORKER_MULTIPROC_METHOD=spawn \ + CUDA_VISIBLE_DEVICES=0 \ + vllm serve $MODEL \ + --port 8100 \ + --disable-log-requests \ + --enforce-eager \ + --kv-transfer-config \ + '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_producer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "producer1"}}' + + +elif [[ $1 == 
"decoder" ]]; then + # Decoder listens on port 8200 + decode_config_file=$SCRIPT_DIR/configs/lmcache-decoder-config.yaml + + UCX_TLS=cuda_ipc,cuda_copy,tcp \ + LMCACHE_CONFIG_FILE=$decode_config_file \ + LMCACHE_USE_EXPERIMENTAL=True \ + VLLM_ENABLE_V1_MULTIPROCESSING=1 \ + VLLM_WORKER_MULTIPROC_METHOD=spawn \ + CUDA_VISIBLE_DEVICES=1 \ + vllm serve $MODEL \ + --port 8200 \ + --disable-log-requests \ + --enforce-eager \ + --kv-transfer-config \ + '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_consumer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "consumer1"}}' + + +else + echo "Invalid role: $1" + echo "Should be either prefill, decode" + exit 1 +fi diff --git a/examples/lmcache/kv_cache_sharing_lmcache_v1.py b/examples/lmcache/kv_cache_sharing_lmcache_v1.py new file mode 100644 index 00000000000..af1b4351dd5 --- /dev/null +++ b/examples/lmcache/kv_cache_sharing_lmcache_v1.py @@ -0,0 +1,130 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This file demonstrates the example usage of remote KV cache sharing +with LMCache. +We will launch 2 vllm instances, and launch an additional LMCache server. +KV cache is transferred in the following manner: +(1) vLLM instance 1 -> LMCache server (KV cache store). +(2) LMCache server -> vLLM instance 2 (KV cache reuse/retrieve). + +Note that lmcache needs to be installed to run this example. +Learn more about LMCache in https://github.com/LMCache/LMCache. +""" +import os +import subprocess +import time +from multiprocessing import Event, Process + +from lmcache.experimental.cache_engine import LMCacheEngineBuilder +from lmcache.integration.vllm.utils import ENGINE_NAME + +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig + +# LMCache-related environment variables +# The port to start LMCache server +port = 8100 +# Use experimental features in LMCache +os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True" +# LMCache is set to use 256 tokens per chunk +os.environ["LMCACHE_CHUNK_SIZE"] = "256" +# Disable local CPU backend in LMCache +os.environ["LMCACHE_LOCAL_CPU"] = "False" +# Set local CPU memory buffer limit to 5.0 GB +os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0" +# Set the remote URL for LMCache server +os.environ["LMCACHE_REMOTE_URL"] = f"lm://localhost:{port}" +# Set the serializer/deserializer between vllm and LMCache server +# `naive` indicates using raw bytes of the tensor without any compression +os.environ["LMCACHE_REMOTE_SERDE"] = "naive" + +prompts = [ + "Hello, how are you?" * 1000, +] + + +def run_store(store_done, prompts): + # We use GPU 0 for KV cache store process. + os.environ["CUDA_VISIBLE_DEVICES"] = "0" + + sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) + + ktc = KVTransferConfig.from_cli( + '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}') + # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB + # memory. Reduce the value if your GPU has less memory. + llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", + kv_transfer_config=ktc, + max_model_len=8000, + gpu_memory_utilization=0.8, + enforce_eager=True) + + outputs = llm.generate(prompts, sampling_params) + for output in outputs: + generated_text = output.outputs[0].text + print(f"Generated text: {generated_text!r}") + print("KV cache store is finished.") + store_done.set() + + # Clean up lmcache backend + LMCacheEngineBuilder.destroy(ENGINE_NAME) + + +def run_retrieve(store_done, prompts, timeout=1): + # We use GPU 1 for KV cache retrieve process. 
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1" + + sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) + + ktc = KVTransferConfig.from_cli( + '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}') + # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB + # of memory. Reduce the value if your GPU has less memory. + llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", + kv_transfer_config=ktc, + max_model_len=8000, + gpu_memory_utilization=0.8, + enforce_eager=True) + + print("Waiting for KV cache store to finish...") + store_done.wait() + time.sleep(timeout) + + outputs = llm.generate(prompts, sampling_params) + for output in outputs: + generated_text = output.outputs[0].text + print(f"Generated text: {generated_text!r}") + + # Clean up lmcache backend + LMCacheEngineBuilder.destroy(ENGINE_NAME) + + +def run_lmcache_server(port): + server_proc = subprocess.Popen([ + "python", "-m", "lmcache.experimental.server", "localhost", + str(port) + ]) + return server_proc + + +def main(): + store_done = Event() + store_process = Process(target=run_store, args=(store_done, prompts)) + retrieve_process = Process(target=run_retrieve, args=(store_done, prompts)) + lmcache_server_process = run_lmcache_server(port) + + # Start KV cache store process + store_process.start() + + # Start KV cache retrieve process + retrieve_process.start() + + # Clean up the processes + store_process.join() + retrieve_process.terminate() + lmcache_server_process.terminate() + lmcache_server_process.wait() + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 8f6779088e8..bab41c915c3 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -38,6 +38,37 @@ class ModelRequestData(NamedTuple): # Unless specified, these settings have been tested to work on a single L4. +# Granite Speech +def run_granite_speech(question: str, audio_count: int) -> ModelRequestData: + # NOTE - the setting in this example are somehat different than what is + # optimal for granite speech, and it is generally recommended to use beam + # search. Check the model README for suggested settings. + # https://huggingface.co/ibm-granite/granite-speech-3.3-8b + model_name = "ibm-granite/granite-speech-3.3-8b" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=2048, + max_num_seqs=2, + enable_lora=True, + max_lora_rank=64, + limit_mm_per_prompt={"audio": audio_count}, + ) + + # The model has an audio-specific lora directly in its model dir; + # it should be enabled whenever you pass audio inputs to the model. + speech_lora_path = model_name + audio_placeholder = "<|audio|>" * audio_count + prompts = f"<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. 
You are a helpful AI assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>{audio_placeholder}{question}<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>" # noqa: E501 + + return ModelRequestData( + engine_args=engine_args, + prompt=prompts, + lora_requests=[LoRARequest("speech", 1, speech_lora_path)], + ) + + # MiniCPM-O def run_minicpmo(question: str, audio_count: int) -> ModelRequestData: model_name = "openbmb/MiniCPM-o-2_6" @@ -89,7 +120,7 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData: engine_args = EngineArgs( model=model_path, trust_remote_code=True, - max_model_len=4096, + max_model_len=12800, max_num_seqs=2, enable_lora=True, max_lora_rank=320, @@ -130,6 +161,36 @@ def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData: ) +# Qwen2.5-Omni +def run_qwen2_5_omni(question: str, audio_count: int): + model_name = "Qwen/Qwen2.5-Omni-7B" + + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}, + ) + + audio_in_prompt = "".join([ + "<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count) + ]) + + default_system = ( + "You are Qwen, a virtual human developed by the Qwen Team, Alibaba " + "Group, capable of perceiving auditory and visual inputs, as well as " + "generating text and speech.") + + prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n" + f"{audio_in_prompt}{question}<|im_end|>\n" + "<|im_start|>assistant\n") + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + ) + + # Ultravox 0.5-1B def run_ultravox(question: str, audio_count: int) -> ModelRequestData: model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b" @@ -179,9 +240,11 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData: model_example_map = { + "granite_speech": run_granite_speech, "minicpmo": run_minicpmo, "phi4_mm": run_phi4mm, "qwen2_audio": run_qwen2_audio, + "qwen2_5_omni": run_qwen2_5_omni, "ultravox": run_ultravox, "whisper": run_whisper, } diff --git a/examples/offline_inference/batch_llm_inference.py b/examples/offline_inference/batch_llm_inference.py new file mode 100644 index 00000000000..6548857b6d1 --- /dev/null +++ b/examples/offline_inference/batch_llm_inference.py @@ -0,0 +1,90 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This example shows how to use Ray Data for data parallel batch inference. + +Ray Data is a data processing framework that can handle large datasets +and integrates tightly with vLLM for data-parallel inference. + +As of Ray 2.44, Ray Data has a native integration with +vLLM (under ray.data.llm). + +Ray Data provides functionality for: +* Reading and writing to cloud storage (S3, GCS, etc.) +* Automatic sharding and load-balancing across a cluster +* Optimized configuration of vLLM using continuous batching +* Compatible with tensor/pipeline parallel inference as well. + +Learn more about Ray Data's LLM integration: +https://docs.ray.io/en/latest/data/working-with-llms.html +""" +import ray +from packaging.version import Version +from ray.data.llm import build_llm_processor, vLLMEngineProcessorConfig + +assert Version(ray.__version__) >= Version( + "2.44.1"), "Ray version must be at least 2.44.1" + +# Uncomment to reduce clutter in stdout +# ray.init(log_to_driver=False) +# ray.data.DataContext.get_current().enable_progress_bars = False + +# Read one text file from S3. 
Ray Data supports reading multiple files +# from cloud storage (such as JSONL, Parquet, CSV, binary format). +ds = ray.data.read_text("s3://anonymous@air-example-data/prompts.txt") +print(ds.schema()) + +size = ds.count() +print(f"Size of dataset: {size} prompts") + +# Configure vLLM engine. +config = vLLMEngineProcessorConfig( + model_source="unsloth/Llama-3.1-8B-Instruct", + engine_kwargs={ + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4096, + "max_model_len": 16384, + }, + concurrency=1, # set the number of parallel vLLM replicas + batch_size=64, +) + +# Create a Processor object, which will be used to +# do batch inference on the dataset +vllm_processor = build_llm_processor( + config, + preprocess=lambda row: dict( + messages=[{ + "role": "system", + "content": "You are a bot that responds with haikus." + }, { + "role": "user", + "content": row["text"] + }], + sampling_params=dict( + temperature=0.3, + max_tokens=250, + )), + postprocess=lambda row: dict( + answer=row["generated_text"], + **row # This will return all the original columns in the dataset. + ), +) + +ds = vllm_processor(ds) + +# Peek first 10 results. +# NOTE: This is for local testing and debugging. For production use case, +# one should write full result out as shown below. +outputs = ds.take(limit=10) + +for output in outputs: + prompt = output["prompt"] + generated_text = output["generated_text"] + print(f"Prompt: {prompt!r}") + print(f"Generated text: {generated_text!r}") + +# Write inference output data out as Parquet files to S3. +# Multiple files would be written to the output destination, +# and each task would write one or more files separately. +# +# ds.write_parquet("s3://") diff --git a/examples/offline_inference/cpu_offload_lmcache.py b/examples/offline_inference/cpu_offload_lmcache.py deleted file mode 100644 index 8211629b24e..00000000000 --- a/examples/offline_inference/cpu_offload_lmcache.py +++ /dev/null @@ -1,65 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -""" -This file demonstrates the example usage of cpu offloading -with LMCache. - -Note that `pip install lmcache` is needed to run this example. -Learn more about LMCache in https://github.com/LMCache/LMCache. -""" -import os -import time - -from lmcache.experimental.cache_engine import LMCacheEngineBuilder -from lmcache.integration.vllm.utils import ENGINE_NAME - -from vllm import LLM, SamplingParams -from vllm.config import KVTransferConfig - -# LMCache-related environment variables -# Use experimental features in LMCache -os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True" -# LMCache is set to use 256 tokens per chunk -os.environ["LMCACHE_CHUNK_SIZE"] = "256" -# Enable local CPU backend in LMCache -os.environ["LMCACHE_LOCAL_CPU"] = "True" -# Set local CPU memory limit to 5.0 GB -os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0" - -# This example script runs two requests with a shared prefix. -shared_prompt = "Hello, how are you?" * 1000 -first_prompt = [ - shared_prompt + "Hello, my name is", -] -second_prompt = [ - shared_prompt + "Tell me a very long story", -] - -sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) - -ktc = KVTransferConfig.from_cli( - '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}') -# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB -# memory. Reduce the value if your GPU has less memory. -# Note that LMCache is not compatible with chunked prefill for now. 
-llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", - kv_transfer_config=ktc, - max_model_len=8000, - enable_chunked_prefill=False, - gpu_memory_utilization=0.8) - -outputs = llm.generate(first_prompt, sampling_params) -for output in outputs: - generated_text = output.outputs[0].text - print(f"Generated text: {generated_text!r}") -print("First request done.") - -time.sleep(1) - -outputs = llm.generate(second_prompt, sampling_params) -for output in outputs: - generated_text = output.outputs[0].text - print(f"Generated text: {generated_text!r}") -print("Second request done.") - -# Clean up lmcache backend -LMCacheEngineBuilder.destroy(ENGINE_NAME) diff --git a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py new file mode 100644 index 00000000000..66efbc0c9de --- /dev/null +++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig + +# Read prompts from output.txt +prompts = [] +try: + with open("output.txt") as f: + for line in f: + prompts.append(line.strip()) + print(f"Loaded {len(prompts)} prompts from output.txt") +except FileNotFoundError: + print("Error: output.txt file not found") + exit(-1) + +sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) + +llm = LLM( + model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + max_num_batched_tokens=64, + max_num_seqs=16, + kv_transfer_config=KVTransferConfig.from_cli( + '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",' + '"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}' + )) #, max_model_len=2048, max_num_batched_tokens=2048) + +# 1ST generation (prefill instance) +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py new file mode 100644 index 00000000000..f7cbf6557d5 --- /dev/null +++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig + +context = "Hi " * 1000 +context2 = "Hey " * 500 +prompts = [ + context + "Hello, my name is", + context + "The capital of France is", + context2 + "Your name is", + context2 + "The capital of China is", +] + +sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) + +llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + kv_transfer_config=KVTransferConfig.from_cli( + '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", ' + '"kv_connector_extra_config": ' + '{"shared_storage_path": "local_storage"}}') + ) #, max_model_len=2048, max_num_batched_tokens=2048) + +# 1ST generation (prefill instance) +outputs = llm.generate( + prompts, + sampling_params, +) + +new_prompts = [] +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + new_prompts.append(prompt + generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +# Write new_prompts to output.txt +with 
open("output.txt", "w") as f: + for prompt in new_prompts: + f.write(prompt + "\n") +print(f"Saved {len(new_prompts)} prompts to output.txt") diff --git a/examples/offline_inference/disaggregated-prefill-v1/run.sh b/examples/offline_inference/disaggregated-prefill-v1/run.sh new file mode 100644 index 00000000000..0ebf45a1586 --- /dev/null +++ b/examples/offline_inference/disaggregated-prefill-v1/run.sh @@ -0,0 +1,5 @@ +rm -rf local_storage/ +rm output.txt + +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 prefill_example.py +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py diff --git a/examples/offline_inference/distributed.py b/examples/offline_inference/distributed.py deleted file mode 100644 index e890c6dad8b..00000000000 --- a/examples/offline_inference/distributed.py +++ /dev/null @@ -1,109 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -""" -This example shows how to use Ray Data for running offline batch inference -distributively on a multi-nodes cluster. - -Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html -""" - -from typing import Any - -import numpy as np -import ray -from packaging.version import Version -from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy - -from vllm import LLM, SamplingParams - -assert Version(ray.__version__) >= Version( - "2.22.0"), "Ray version must be at least 2.22.0" - -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -# Set tensor parallelism per instance. -tensor_parallel_size = 1 - -# Set number of instances. Each instance will use tensor_parallel_size GPUs. -num_instances = 1 - - -# Create a class to do batch inference. -class LLMPredictor: - - def __init__(self): - # Create an LLM. - self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", - tensor_parallel_size=tensor_parallel_size) - - def __call__(self, batch: dict[str, np.ndarray]) -> dict[str, list]: - # Generate texts from the prompts. - # The output is a list of RequestOutput objects that contain the prompt, - # generated text, and other information. - outputs = self.llm.generate(batch["text"], sampling_params) - prompt: list[str] = [] - generated_text: list[str] = [] - for output in outputs: - prompt.append(output.prompt) - generated_text.append(' '.join([o.text for o in output.outputs])) - return { - "prompt": prompt, - "generated_text": generated_text, - } - - -# Read one text file from S3. Ray Data supports reading multiple files -# from cloud storage (such as JSONL, Parquet, CSV, binary format). -ds = ray.data.read_text("s3://anonymous@air-example-data/prompts.txt") - - -# For tensor_parallel_size > 1, we need to create placement groups for vLLM -# to use. Every actor has to have its own placement group. -def scheduling_strategy_fn(): - # One bundle per tensor parallel worker - pg = ray.util.placement_group( - [{ - "GPU": 1, - "CPU": 1 - }] * tensor_parallel_size, - strategy="STRICT_PACK", - ) - return dict(scheduling_strategy=PlacementGroupSchedulingStrategy( - pg, placement_group_capture_child_tasks=True)) - - -resources_kwarg: dict[str, Any] = {} -if tensor_parallel_size == 1: - # For tensor_parallel_size == 1, we simply set num_gpus=1. - resources_kwarg["num_gpus"] = 1 -else: - # Otherwise, we have to set num_gpus=0 and provide - # a function that will create a placement group for - # each instance. 
- resources_kwarg["num_gpus"] = 0 - resources_kwarg["ray_remote_args_fn"] = scheduling_strategy_fn - -# Apply batch inference for all input data. -ds = ds.map_batches( - LLMPredictor, - # Set the concurrency to the number of LLM instances. - concurrency=num_instances, - # Specify the batch size for inference. - batch_size=32, - **resources_kwarg, -) - -# Peek first 10 results. -# NOTE: This is for local testing and debugging. For production use case, -# one should write full result out as shown below. -outputs = ds.take(limit=10) -for output in outputs: - prompt = output["prompt"] - generated_text = output["generated_text"] - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -# Write inference output data out as Parquet files to S3. -# Multiple files would be written to the output destination, -# and each task would write one or more files separately. -# -# ds.write_parquet("s3://") diff --git a/examples/offline_inference/eagle.py b/examples/offline_inference/eagle.py index c7b4368c9b1..91e2f68ecff 100644 --- a/examples/offline_inference/eagle.py +++ b/examples/offline_inference/eagle.py @@ -36,6 +36,10 @@ def parse_args(): help="downloaded from the eagle repo " \ "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/" ) + parser.add_argument("--method", + type=str, + default='eagle', + choices=['eagle', 'eagle3']) parser.add_argument("--max_num_seqs", type=int, default=8) parser.add_argument("--num_prompts", type=int, default=80) parser.add_argument("--num_spec_tokens", type=int, default=2) @@ -52,8 +56,14 @@ def main(): args = parse_args() - model_dir = "meta-llama/Meta-Llama-3-8B-Instruct" - eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm" + model_dir = "meta-llama/Llama-3.1-8B-Instruct" + + if args.method == 'eagle': + eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" + elif args.method == 'eagle3': + eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" + else: + raise ValueError(f"unknown method: {args.method}") max_model_len = 2048 @@ -81,7 +91,7 @@ def main(): max_num_seqs=args.max_num_seqs, gpu_memory_utilization=0.8, speculative_config={ - "method": "eagle", + "method": args.method, "model": eagle_dir, "num_speculative_tokens": args.num_spec_tokens, "draft_tensor_parallel_size": args.draft_tp, @@ -95,6 +105,9 @@ def main(): outputs = llm.generate(prompt_token_ids=prompt_ids, sampling_params=sampling_params) + if not hasattr(outputs, "metrics") or outputs.metrics is None: + return + # calculate the average number of accepted tokens per forward pass, +1 is # to account for the token from the target model that's always going to be # accepted @@ -109,6 +122,11 @@ def main(): {sum(acceptance_counts) / acceptance_counts[0]:.2f}") print("-" * 50) + # print acceptance at each token position + for i in range(len(acceptance_counts)): + print(f"acceptance at token {i}:" + f"{acceptance_counts[i] / (acceptance_counts[0]):.2f}") + if __name__ == "__main__": main() diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py index 61e5f5eae4e..2883c37ca23 100644 --- a/examples/offline_inference/encoder_decoder_multimodal.py +++ b/examples/offline_inference/encoder_decoder_multimodal.py @@ -22,7 +22,7 @@ class ModelRequestData(NamedTuple): def run_florence2(): engine_args = EngineArgs( model="microsoft/Florence-2-large", - tokenizer="facebook/bart-large", + tokenizer="Isotr0py/Florence-2-tokenizer", max_num_seqs=8, trust_remote_code=True, limit_mm_per_prompt={"image": 1}, @@ -165,6 +165,7 @@ def main(args): 
temperature=0, top_p=1.0, max_tokens=64, + skip_special_tokens=False, ) start = time.time() diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py index abff90d1c0c..d84cd9ee9f5 100644 --- a/examples/offline_inference/llm_engine_example.py +++ b/examples/offline_inference/llm_engine_example.py @@ -50,6 +50,13 @@ def initialize_engine(args: argparse.Namespace) -> LLMEngine: return LLMEngine.from_engine_args(engine_args) +def parse_args(): + parser = FlexibleArgumentParser( + description='Demo on using the LLMEngine class directly') + parser = EngineArgs.add_cli_args(parser) + return parser.parse_args() + + def main(args: argparse.Namespace): """Main function that sets up and runs the prompt processing.""" engine = initialize_engine(args) @@ -58,8 +65,5 @@ def main(args: argparse.Namespace): if __name__ == '__main__': - parser = FlexibleArgumentParser( - description='Demo on using the LLMEngine class directly') - parser = EngineArgs.add_cli_args(parser) - args = parser.parse_args() + args = parse_args() main(args) diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py index 9bb66fdbc45..37c3181dc5f 100644 --- a/examples/offline_inference/mistral-small.py +++ b/examples/offline_inference/mistral-small.py @@ -16,11 +16,11 @@ # # Mistral format # vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \ # --tokenizer-mode mistral --config-format mistral --load-format mistral \ -# --limit-mm-per-prompt 'image=4' --max-model-len 16384 +# --limit-mm-per-prompt '{"image":4}' --max-model-len 16384 # # # HF format # vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \ -# --limit-mm-per-prompt 'image=4' --max-model-len 16384 +# --limit-mm-per-prompt '{"image":4}' --max-model-len 16384 # ``` # # - Client: @@ -62,6 +62,7 @@ def run_simple_demo(args: argparse.Namespace): tokenizer_mode="mistral" if args.format == "mistral" else "auto", config_format="mistral" if args.format == "mistral" else "auto", load_format="mistral" if args.format == "mistral" else "auto", + limit_mm_per_prompt={"image": 1}, max_model_len=4096, max_num_seqs=2, tensor_parallel_size=2, diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py index 9c818d07573..99303950d39 100644 --- a/examples/offline_inference/profiling.py +++ b/examples/offline_inference/profiling.py @@ -14,7 +14,7 @@ from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs -from vllm.profiler import layerwise_profile +from vllm.profiler.layerwise_profile import layerwise_profile from vllm.utils import FlexibleArgumentParser BATCH_SIZE_DEFAULT = 1 diff --git a/examples/offline_inference/qwen2_5_omni/README.md b/examples/offline_inference/qwen2_5_omni/README.md new file mode 100644 index 00000000000..c30541a598c --- /dev/null +++ b/examples/offline_inference/qwen2_5_omni/README.md @@ -0,0 +1,32 @@ +# Qwen2.5-Omni Offline Inference Examples + +This folder provides several example scripts on how to inference Qwen2.5-Omni offline. + +## Thinker Only + +```bash +# Audio + image + video +python examples/offline_inference/qwen2_5_omni/only_thinker.py -q mixed_modalities + +# Read vision and audio inputs from a single video file +# NOTE: V1 engine does not support interleaved modalities yet. 
+VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q use_audio_in_video + +# Multiple audios +VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q multi_audios +``` + +This script will run the thinker part of Qwen2.5-Omni, and generate text response. + +You can also test Qwen2.5-Omni on a single modality: + +```bash +# Process audio inputs +python examples/offline_inference/audio_language.py --model-type qwen2_5_omni + +# Process image inputs +python examples/offline_inference/vision_language.py --modality image --model-type qwen2_5_omni + +# Process video inputs +python examples/offline_inference/vision_language.py --modality video --model-type qwen2_5_omni +``` diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py new file mode 100644 index 00000000000..c2c28d5ae6a --- /dev/null +++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py @@ -0,0 +1,159 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on Qwen2.5-Omni (thinker only). +""" + +from typing import NamedTuple + +import vllm.envs as envs +from vllm import LLM, SamplingParams +from vllm.assets.audio import AudioAsset +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset +from vllm.utils import FlexibleArgumentParser + + +class QueryResult(NamedTuple): + inputs: dict + limit_mm_per_prompt: dict[str, int] + + +# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on +# lower-end GPUs. +# Unless specified, these settings have been tested to work on a single L4. + +default_system = ( + "You are Qwen, a virtual human developed by the Qwen Team, Alibaba " + "Group, capable of perceiving auditory and visual inputs, as well as " + "generating text and speech.") + + +def get_mixed_modalities_query() -> QueryResult: + question = ("What is recited in the audio? " + "What is the content of this image? Why is this video funny?") + prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>" + "<|vision_bos|><|IMAGE|><|vision_eos|>" + "<|vision_bos|><|VIDEO|><|vision_eos|>" + f"{question}<|im_end|>\n" + f"<|im_start|>assistant\n") + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": { + "audio": + AudioAsset("mary_had_lamb").audio_and_sample_rate, + "image": + ImageAsset("cherry_blossom").pil_image.convert("RGB"), + "video": + VideoAsset(name="baby_reading", num_frames=16).np_ndarrays, + }, + }, + limit_mm_per_prompt={ + "audio": 1, + "image": 1, + "video": 1 + }, + ) + + +def get_use_audio_in_video_query() -> QueryResult: + question = ("Describe the content of the video, " + "then convert what the baby say into text.") + prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>" + f"{question}<|im_end|>\n" + f"<|im_start|>assistant\n") + asset = VideoAsset(name="baby_reading", num_frames=16) + audio = asset.get_audio(sampling_rate=16000) + assert not envs.VLLM_USE_V1, ("V1 does not support use_audio_in_video. 
" + "Please launch this example with " + "`VLLM_USE_V1=0`.") + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": { + "video": asset.np_ndarrays, + "audio": audio, + }, + "mm_processor_kwargs": { + "use_audio_in_video": True, + }, + }, + limit_mm_per_prompt={ + "audio": 1, + "video": 1 + }, + ) + + +def get_multi_audios_query() -> QueryResult: + question = "Are these two audio clips the same?" + prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>" + "<|audio_bos|><|AUDIO|><|audio_eos|>" + f"{question}<|im_end|>\n" + f"<|im_start|>assistant\n") + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": { + "audio": [ + AudioAsset("winning_call").audio_and_sample_rate, + AudioAsset("mary_had_lamb").audio_and_sample_rate, + ], + }, + }, + limit_mm_per_prompt={ + "audio": 2, + }, + ) + + +query_map = { + "mixed_modalities": get_mixed_modalities_query, + "use_audio_in_video": get_use_audio_in_video_query, + "multi_audios": get_multi_audios_query, +} + + +def main(args): + model_name = "Qwen/Qwen2.5-Omni-7B" + query_result = query_map[args.query_type]() + + llm = LLM(model=model_name, + max_model_len=5632, + max_num_seqs=5, + limit_mm_per_prompt=query_result.limit_mm_per_prompt, + seed=args.seed) + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. + sampling_params = SamplingParams(temperature=0.2, max_tokens=64) + + outputs = llm.generate(query_result.inputs, + sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'audio language models') + parser.add_argument('--query-type', + '-q', + type=str, + default="mixed_modalities", + choices=query_map.keys(), + help='Query type.') + parser.add_argument("--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.") + + args = parser.parse_args() + main(args) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 6b533346ac3..aca11f5c50b 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -150,7 +150,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData: engine_args = EngineArgs( model="microsoft/Florence-2-large", - tokenizer="facebook/bart-large", + tokenizer="Isotr0py/Florence-2-tokenizer", max_model_len=4096, max_num_seqs=2, trust_remote_code=True, @@ -376,9 +376,9 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData: engine_args = EngineArgs( model="moonshotai/Kimi-VL-A3B-Instruct", - max_model_len=4096, - disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={"image": 1}, ) return ModelRequestData( @@ -725,6 +725,34 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData: ) +# Ovis2 +def run_ovis2(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "AIDC-AI/Ovis2-1B" + + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=2, + trust_remote_code=True, + dtype="half", + hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]}, + limit_mm_per_prompt={"image": 1}, + ) + + placeholder = "\n" + 
prompts = [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n{placeholder}" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n") for question in questions] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # PaliGemma def run_paligemma(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -814,10 +842,13 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData: engine_args = EngineArgs( model=model_path, trust_remote_code=True, - max_model_len=4096, + max_model_len=5120, max_num_seqs=2, + max_num_batched_tokens=12800, enable_lora=True, max_lora_rank=320, + # Note - mm_processor_kwargs can also be passed to generate/chat calls + mm_processor_kwargs={"dynamic_hd": 16}, limit_mm_per_prompt={"image": 1}, ) @@ -941,6 +972,42 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: ) +# Qwen2.5-Omni +def run_qwen2_5_omni(questions: list[str], modality: str): + model_name = "Qwen/Qwen2.5-Omni-7B" + + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + "fps": [1], + }, + limit_mm_per_prompt={"image": 1}, + ) + + if modality == "image": + placeholder = "<|IMAGE|>" + elif modality == "video": + placeholder = "<|VIDEO|>" + + default_system = ( + "You are Qwen, a virtual human developed by the Qwen Team, Alibaba " + "Group, capable of perceiving auditory and visual inputs, as well as " + "generating text and speech.") + + prompts = [(f"<|im_start|>system\n{default_system}<|im_end|>\n" + f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n") for question in questions] + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # SkyworkR1V def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1002,6 +1069,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: "llama4": run_llama4, "molmo": run_molmo, "NVLM_D": run_nvlm_d, + "ovis2": run_ovis2, "paligemma": run_paligemma, "paligemma2": run_paligemma2, "phi3_v": run_phi3v, @@ -1010,6 +1078,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: "qwen_vl": run_qwen_vl, "qwen2_vl": run_qwen2_vl, "qwen2_5_vl": run_qwen2_5_vl, + "qwen2_5_omni": run_qwen2_5_omni, "skywork_chat": run_skyworkr1v, "smolvlm": run_smolvlm, } @@ -1040,7 +1109,7 @@ def get_multi_modal_input(args): if args.modality == "video": # Input video and question - video = VideoAsset(name="sample_demo_1.mp4", + video = VideoAsset(name="baby_reading", num_frames=args.num_frames).np_ndarrays vid_questions = ["Why is this video funny?"] diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 52e93896706..48d590b05b0 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -331,11 +331,10 @@ def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData: engine_args = EngineArgs( model=model_name, + trust_remote_code=True, max_model_len=4096, max_num_seqs=4, - tensor_parallel_size=1, limit_mm_per_prompt={"image": len(image_urls)}, - trust_remote_code=True, ) placeholders = [{"type": "image", "image": url} for url in image_urls] @@ -437,6 +436,34 @@ def load_nvlm_d(question: 
str, image_urls: list[str]) -> ModelRequestData: ) +# Ovis2 +def load_ovis2(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "AIDC-AI/Ovis2-1B" + + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + trust_remote_code=True, + dtype="half", + limit_mm_per_prompt={"image": len(image_urls)}, + hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]}, + ) + + placeholder = '\n'.join( + [f'Image {i+1}: ' for i in range(len(image_urls))]) + '\n' + prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n{placeholder}" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n") + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "mistral-community/pixtral-12b" @@ -504,11 +531,13 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData: engine_args = EngineArgs( model=model_path, trust_remote_code=True, - max_model_len=10000, + max_model_len=4096, max_num_seqs=2, limit_mm_per_prompt={"image": len(image_urls)}, enable_lora=True, max_lora_rank=320, + # Note - mm_processor_kwargs can also be passed to generate/chat calls + mm_processor_kwargs={"dynamic_hd": 4}, ) placeholders = "".join(f"<|image_{i}|>" @@ -684,6 +713,7 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: "mistral3": load_mistral3, "mllama": load_mllama, "NVLM_D": load_nvlm_d, + "ovis2": load_ovis2, "phi3_v": load_phi3v, "phi4_mm": load_phi4mm, "pixtral_hf": load_pixtral_hf, @@ -790,7 +820,9 @@ def parse_args(): parser.add_argument( "--num-images", "-n", - choices=list(range(1, 13)), # 12 is the max number of images + type=int, + choices=list(range(1, + len(IMAGE_URLS) + 1)), # the max number of images default=2, help="Number of images to use for the demo.") return parser.parse_args() diff --git a/examples/online_serving/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml index 9c48e7d061b..28dba9a6f68 100644 --- a/examples/online_serving/chart-helm/values.yaml +++ b/examples/online_serving/chart-helm/values.yaml @@ -8,7 +8,7 @@ image: # -- Image tag tag: "latest" # -- Container launch command - command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "bfloat16", "--host", "0.0.0.0", "--port", "8000"] + command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "float32", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"] # -- Container port containerPort: 8000 diff --git a/examples/online_serving/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py index 13331609eb0..314f1c5b739 100644 --- a/examples/online_serving/gradio_openai_chatbot_webserver.py +++ b/examples/online_serving/gradio_openai_chatbot_webserver.py @@ -23,10 +23,6 @@ from openai import OpenAI -def create_openai_client(api_key, base_url): - return OpenAI(api_key=api_key, base_url=base_url) - - def format_history_to_openai(history): history_openai_format = [{ "role": "system", diff --git a/examples/online_serving/kv_events.sh b/examples/online_serving/kv_events.sh new file mode 100644 index 00000000000..a111db2179f --- /dev/null +++ b/examples/online_serving/kv_events.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# This file demonstrates the KV cache event publishing +# We will launch a vllm instances configured to publish KV cache +# 
events and launch a simple subscriber to log those events. + +set -xe + +echo "🚧🚧 Warning: The usage of KV cache events is experimental and subject to change 🚧🚧" +sleep 1 + +MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct} + +# Trap the SIGINT signal (triggered by Ctrl+C) +trap 'cleanup' INT + +# Cleanup function +cleanup() { + echo "Caught Ctrl+C, cleaning up..." + # Cleanup commands + pgrep python | xargs kill -9 + pkill -f python + echo "Cleanup complete. Exiting." + exit 0 +} + +export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + +# a function that waits vLLM server to start +wait_for_server() { + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + +vllm serve $MODEL_NAME \ + --port 8100 \ + --max-model-len 100 \ + --enforce-eager \ + --gpu-memory-utilization 0.8 \ + --trust-remote-code \ + --kv-events-config \ + '{"enable_kv_cache_events": true, "publisher": "zmq", "topic": "kv-events"}' & + +wait_for_server 8100 + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +python3 "$SCRIPT_DIR/kv_events_subscriber.py" & +sleep 1 + +# serve two example requests +output1=$(curl -X POST -s http://localhost:8100/v1/completions \ +-H "Content-Type: application/json" \ +-d '{ +"model": "'"$MODEL_NAME"'", +"prompt": "Explain quantum computing in simple terms a 5-year-old could understand.", +"max_tokens": 80, +"temperature": 0 +}') + +output2=$(curl -X POST -s http://localhost:8100/v1/completions \ +-H "Content-Type: application/json" \ +-d '{ +"model": "'"$MODEL_NAME"'", +"prompt": "Explain quantum computing in simple terms a 50-year-old could understand.", +"max_tokens": 80, +"temperature": 0 +}') + +# Cleanup commands +pkill -9 -u "$USER" -f python +pkill -9 -u "$USER" -f vllm + +sleep 1 + +echo "Cleaned up" + +# Print the outputs of the curl requests +echo "" +echo "Output of first request: $output1" +echo "Output of second request: $output2" + +echo "🎉🎉 Successfully finished 2 test requests! 
🎉🎉" +echo "" diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/online_serving/kv_events_subscriber.py new file mode 100644 index 00000000000..88bbbebd747 --- /dev/null +++ b/examples/online_serving/kv_events_subscriber.py @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Any, Optional, Union + +import msgspec +import zmq +from msgspec.msgpack import Decoder + + +# +# Types copied from vllm.distributed.kv_events +# +class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True, + gc=False): + ts: float + events: list[Any] + + +class KVCacheEvent(msgspec.Struct, + array_like=True, + omit_defaults=True, + gc=False, + tag=True): + """Base class for all KV cache-related events""" + + +class BlockStored(KVCacheEvent): + block_hashes: list[int] + parent_block_hash: Optional[int] + token_ids: list[int] + block_size: int + lora_id: Optional[int] + + +class BlockRemoved(KVCacheEvent): + block_hashes: list[int] + + +class AllBlocksCleared(KVCacheEvent): + pass + + +class KVEventBatch(EventBatch): + events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]] + + +def process_event(event_batch): + print(f"Received event batch at {event_batch.ts}:") + for event in event_batch.events: + print(f" - {event}") + + +def main(): + decoder = Decoder(type=KVEventBatch) + last_seq = -1 + + context = zmq.Context() + + # Set up the main subscription socket + sub = context.socket(zmq.SUB) + sub.connect("tcp://localhost:5557") + topic = "kv-events" + sub.setsockopt_string(zmq.SUBSCRIBE, topic) + + # Initialize replay socket + replay = context.socket(zmq.REQ) + replay.connect("tcp://localhost:5558") + poller = zmq.Poller() + poller.register(replay, zmq.POLLIN) + + print("Listening for KV cache events on topic:", topic) + + while True: + try: + if sub.poll(50): + _, seq_bytes, payload = sub.recv_multipart() + seq = int.from_bytes(seq_bytes, "big") + + if last_seq >= 0 and seq > last_seq + 1: + missed = seq - last_seq - 1 + print(f"Missed {missed} messages" + f" (last: {last_seq}, current: {seq})") + + replay.send((last_seq + 1).to_bytes(8, "big")) + + while poller.poll(timeout=200): + seq_bytes, replay_payload = replay.recv_multipart() + if not replay_payload: + # End of replay marker is sent as an empty frame + # for the payload + break + + replay_seq = int.from_bytes(seq_bytes, "big") + + if replay_seq > last_seq: + event_batch = decoder.decode(replay_payload) + process_event(event_batch) + last_seq = replay_seq + if replay_seq >= seq - 1: + break + + event_batch = decoder.decode(payload) + process_event(event_batch) + + # ... do other periodic work or check for shutdown ... 
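+            # NOTE: the gap handling above assumes the publisher tags each
+            # message with a monotonically increasing sequence number and
+            # serves missed messages over the REQ/REP replay socket
+            # (port 5558 here); an empty payload frame marks end of replay.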
+ + except KeyboardInterrupt: + print("Interrupted") + break + except Exception as e: + print("Error decoding message:", e) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index ecfcf05a90d..70db4d95e64 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -9,7 +9,7 @@ (multi-image inference with Phi-3.5-vision-instruct) vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ - --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 + --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}' (audio inference with Ultravox) vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096 @@ -303,12 +303,7 @@ def run_audio() -> None: } -def main(args) -> None: - chat_type = args.chat_type - example_function_map[chat_type]() - - -if __name__ == "__main__": +def parse_args(): parser = FlexibleArgumentParser( description='Demo on using OpenAI client for online serving with ' 'multimodal language models served with vLLM.') @@ -318,5 +313,14 @@ def main(args) -> None: default="single-image", choices=list(example_function_map.keys()), help='Conversation type with multimodal data.') - args = parser.parse_args() + return parser.parse_args() + + +def main(args) -> None: + chat_type = args.chat_type + example_function_map[chat_type]() + + +if __name__ == "__main__": + args = parse_args() main(args) diff --git a/examples/online_serving/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py index 416fb61ca8b..c25203860ff 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools.py @@ -17,6 +17,7 @@ --enable-auto-tool-choice --tool-call-parser hermes """ import json +from typing import Any from openai import OpenAI @@ -24,15 +25,6 @@ openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, -) - -models = client.models.list() -model = models.data[0].id - tools = [{ "type": "function", "function": { @@ -78,86 +70,123 @@ "Can you tell me what the temperate will be in Dallas, in fahrenheit?" 
}] -chat_completion = client.chat.completions.create(messages=messages, - model=model, - tools=tools) - -print("Chat completion results:") -print(chat_completion) -print("\n\n") - -tool_calls_stream = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - stream=True) - -chunks = [] -for chunk in tool_calls_stream: - chunks.append(chunk) - if chunk.choices[0].delta.tool_calls: - print(chunk.choices[0].delta.tool_calls[0]) - else: - print(chunk.choices[0].delta) - -arguments = [] -tool_call_idx = -1 -for chunk in chunks: - - if chunk.choices[0].delta.tool_calls: - tool_call = chunk.choices[0].delta.tool_calls[0] - - if tool_call.index != tool_call_idx: - if tool_call_idx >= 0: - print( - f"streamed tool call arguments: {arguments[tool_call_idx]}" - ) - tool_call_idx = chunk.choices[0].delta.tool_calls[0].index - arguments.append("") - if tool_call.id: - print(f"streamed tool call id: {tool_call.id} ") - - if tool_call.function: - if tool_call.function.name: - print(f"streamed tool call name: {tool_call.function.name}") - - if tool_call.function.arguments: - arguments[tool_call_idx] += tool_call.function.arguments - -if len(arguments): - print(f"streamed tool call arguments: {arguments[-1]}") - -print("\n\n") - -messages.append({ - "role": "assistant", - "tool_calls": chat_completion.choices[0].message.tool_calls -}) - -# Now, simulate a tool call def get_current_weather(city: str, state: str, unit: 'str'): return ("The weather in Dallas, Texas is 85 degrees fahrenheit. It is " "partly cloudly, with highs in the 90's.") -available_tools = {"get_current_weather": get_current_weather} - -completion_tool_calls = chat_completion.choices[0].message.tool_calls -for call in completion_tool_calls: - tool_to_call = available_tools[call.function.name] - args = json.loads(call.function.arguments) - result = tool_to_call(**args) - print(result) +def handle_tool_calls_stream( + client: OpenAI, + messages: list[dict[str, str]], + model: str, + tools: list[dict[str, Any]], +) -> list[Any]: + tool_calls_stream = client.chat.completions.create(messages=messages, + model=model, + tools=tools, + stream=True) + chunks = [] + print("chunks: ") + for chunk in tool_calls_stream: + chunks.append(chunk) + if chunk.choices[0].delta.tool_calls: + print(chunk.choices[0].delta.tool_calls[0]) + else: + print(chunk.choices[0].delta) + return chunks + + +def handle_tool_calls_arguments(chunks: list[Any]) -> list[str]: + arguments = [] + tool_call_idx = -1 + print("arguments: ") + for chunk in chunks: + if chunk.choices[0].delta.tool_calls: + tool_call = chunk.choices[0].delta.tool_calls[0] + if tool_call.index != tool_call_idx: + if tool_call_idx >= 0: + print(f"streamed tool call arguments: " + f"{arguments[tool_call_idx]}") + tool_call_idx = chunk.choices[0].delta.tool_calls[0].index + arguments.append("") + if tool_call.id: + print(f"streamed tool call id: {tool_call.id} ") + + if tool_call.function: + if tool_call.function.name: + print( + f"streamed tool call name: {tool_call.function.name}") + + if tool_call.function.arguments: + arguments[tool_call_idx] += tool_call.function.arguments + + return arguments + + +def main(): + # Initialize OpenAI client + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) + + # Get available models and select one + models = client.models.list() + model = models.data[0].id + + chat_completion = client.chat.completions.create(messages=messages, + model=model, + tools=tools) + + print("-" * 
70) + print("Chat completion results:") + print(chat_completion) + print("-" * 70) + + # Stream tool calls + chunks = handle_tool_calls_stream(client, messages, model, tools) + print("-" * 70) + + # Handle arguments from streamed tool calls + arguments = handle_tool_calls_arguments(chunks) + + if len(arguments): + print(f"streamed tool call arguments: {arguments[-1]}\n") + + print("-" * 70) + + # Add tool call results to the conversation messages.append({ - "role": "tool", - "content": result, - "tool_call_id": call.id, - "name": call.function.name + "role": "assistant", + "tool_calls": chat_completion.choices[0].message.tool_calls }) -chat_completion_2 = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - stream=False) -print("\n\n") -print(chat_completion_2) + # Now, simulate a tool call + available_tools = {"get_current_weather": get_current_weather} + + completion_tool_calls = chat_completion.choices[0].message.tool_calls + for call in completion_tool_calls: + tool_to_call = available_tools[call.function.name] + args = json.loads(call.function.arguments) + result = tool_to_call(**args) + print("tool_to_call result: ", result) + messages.append({ + "role": "tool", + "content": result, + "tool_call_id": call.id, + "name": call.function.name + }) + + chat_completion_2 = client.chat.completions.create(messages=messages, + model=model, + tools=tools, + stream=False) + print("Chat completion2 results:") + print(chat_completion_2) + print("-" * 70) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_completion_client_with_tools_required.py b/examples/online_serving/openai_chat_completion_client_with_tools_required.py index 779369d1634..97d900bb75f 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools_required.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """ -To run this example, you can start the vLLM server +To run this example, you can start the vLLM server without any specific flags: ```bash @@ -8,7 +8,7 @@ --guided-decoding-backend outlines ``` -This example demonstrates how to generate chat completions +This example demonstrates how to generate chat completions using the OpenAI Python client library. 
""" @@ -18,15 +18,6 @@ openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, -) - -models = client.models.list() -model = models.data[0].id - tools = [ { "type": "function", @@ -116,21 +107,36 @@ }, ] -chat_completion = client.chat.completions.create( - messages=messages, - model=model, - tools=tools, - tool_choice="required", - stream=True # Enable streaming response -) -for chunk in chat_completion: - if chunk.choices and chunk.choices[0].delta.tool_calls: - print(chunk.choices[0].delta.tool_calls) +def main(): + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) + + models = client.models.list() + model = models.data[0].id + + chat_completion = client.chat.completions.create( + messages=messages, + model=model, + tools=tools, + tool_choice="required", + stream=True # Enable streaming response + ) + + for chunk in chat_completion: + if chunk.choices and chunk.choices[0].delta.tool_calls: + print(chunk.choices[0].delta.tool_calls) + + chat_completion = client.chat.completions.create(messages=messages, + model=model, + tools=tools, + tool_choice="required") + + print(chat_completion.choices[0].message.tool_calls) -chat_completion = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - tool_choice="required") -print(chat_completion.choices[0].message.tool_calls) +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py index 986ff500e58..9c57af1c158 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs.py @@ -1,43 +1,49 @@ # SPDX-License-Identifier: Apache-2.0 +""" +To run this example, you need to start the vLLM server: + +```bash +vllm serve Qwen/Qwen2.5-3B-Instruct +``` +""" from enum import Enum from openai import BadRequestError, OpenAI from pydantic import BaseModel -client = OpenAI( - base_url="http://localhost:8000/v1", - api_key="-", -) # Guided decoding by Choice (list of possible options) -completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[{ - "role": "user", - "content": "Classify this sentiment: vLLM is wonderful!" - }], - extra_body={"guided_choice": ["positive", "negative"]}, -) -print(completion.choices[0].message.content) +def guided_choice_completion(client: OpenAI, model: str): + completion = client.chat.completions.create( + model=model, + messages=[{ + "role": "user", + "content": "Classify this sentiment: vLLM is wonderful!" + }], + extra_body={"guided_choice": ["positive", "negative"]}, + ) + return completion.choices[0].message.content + # Guided decoding by Regex -prompt = ("Generate an email address for Alan Turing, who works in Enigma." - "End in .com and new line. Example result:" - "alan.turing@enigma.com\n") - -completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[{ - "role": "user", - "content": prompt, - }], - extra_body={ - "guided_regex": "\w+@\w+\.com\n", - "stop": ["\n"] - }, -) -print(completion.choices[0].message.content) +def guided_regex_completion(client: OpenAI, model: str): + prompt = ("Generate an email address for Alan Turing, who works in Enigma." + "End in .com and new line. 
Example result:" + "alan.turing@enigma.com\n") + + completion = client.chat.completions.create( + model=model, + messages=[{ + "role": "user", + "content": prompt, + }], + extra_body={ + "guided_regex": r"\w+@\w+\.com\n", + "stop": ["\n"] + }, + ) + return completion.choices[0].message.content # Guided decoding by JSON using Pydantic schema @@ -54,66 +60,101 @@ class CarDescription(BaseModel): car_type: CarType -json_schema = CarDescription.model_json_schema() - -prompt = ("Generate a JSON with the brand, model and car_type of" - "the most iconic car from the 90's") -completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[{ - "role": "user", - "content": prompt, - }], - extra_body={"guided_json": json_schema}, -) -print(completion.choices[0].message.content) +def guided_json_completion(client: OpenAI, model: str): + json_schema = CarDescription.model_json_schema() -# Guided decoding by Grammar -simplified_sql_grammar = """ - ?start: select_statement + prompt = ("Generate a JSON with the brand, model and car_type of" + "the most iconic car from the 90's") + completion = client.chat.completions.create( + model=model, + messages=[{ + "role": "user", + "content": prompt, + }], + extra_body={"guided_json": json_schema}, + ) + return completion.choices[0].message.content - ?select_statement: "SELECT " column_list " FROM " table_name - ?column_list: column_name ("," column_name)* +# Guided decoding by Grammar +def guided_grammar_completion(client: OpenAI, model: str): + simplified_sql_grammar = """ + root ::= select_statement - ?table_name: identifier + select_statement ::= "SELECT " column " from " table " where " condition - ?column_name: identifier + column ::= "col_1 " | "col_2 " - ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ -""" + table ::= "table_1 " | "table_2 " -prompt = ("Generate an SQL query to show the 'username' and 'email'" - "from the 'users' table.") -completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[{ - "role": "user", - "content": prompt, - }], - extra_body={"guided_grammar": simplified_sql_grammar}, -) -print(completion.choices[0].message.content) + condition ::= column "= " number -# Extra backend options -prompt = ("Generate an email address for Alan Turing, who works in Enigma." - "End in .com and new line. Example result:" - "alan.turing@enigma.com\n") + number ::= "1 " | "2 " + """ -try: - # The no-fallback option forces vLLM to use xgrammar, so when it fails - # you get a 400 with the reason why + prompt = ("Generate an SQL query to show the 'username' and 'email'" + "from the 'users' table.") completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", + model=model, messages=[{ "role": "user", "content": prompt, }], - extra_body={ - "guided_regex": "\w+@\w+\.com\n", - "stop": ["\n"], - "guided_decoding_backend": "xgrammar:no-fallback" - }, + extra_body={"guided_grammar": simplified_sql_grammar}, ) -except BadRequestError as e: - print("This error is expected:", e) + return completion.choices[0].message.content + + +# Extra backend options +def extra_backend_options_completion(client: OpenAI, model: str): + prompt = ("Generate an email address for Alan Turing, who works in Enigma." + "End in .com and new line. 
Example result:" + "alan.turing@enigma.com\n") + + try: + # The guided_decoding_disable_fallback option forces vLLM to use + # xgrammar, so when it fails you get a 400 with the reason why + completion = client.chat.completions.create( + model=model, + messages=[{ + "role": "user", + "content": prompt, + }], + extra_body={ + "guided_regex": r"\w+@\w+\.com\n", + "stop": ["\n"], + "guided_decoding_backend": "xgrammar", + "guided_decoding_disable_fallback": True, + }, + ) + return completion.choices[0].message.content + except BadRequestError as e: + print("This error is expected:", e) + + +def main(): + client: OpenAI = OpenAI( + base_url="http://localhost:8000/v1", + api_key="-", + ) + + model = "Qwen/Qwen2.5-3B-Instruct" + + print("Guided Choice Completion:") + print(guided_choice_completion(client, model)) + + print("\nGuided Regex Completion:") + print(guided_regex_completion(client, model)) + + print("\nGuided JSON Completion:") + print(guided_json_completion(client, model)) + + print("\nGuided Grammar Completion:") + print(guided_grammar_completion(client, model)) + + print("\nExtra Backend Options Completion:") + print(extra_backend_options_completion(client, model)) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py new file mode 100644 index 00000000000..b807bc54052 --- /dev/null +++ b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py @@ -0,0 +1,85 @@ +# SPDX-License-Identifier: Apache-2.0 +from openai import OpenAI + +# This example demonstrates the `structural_tag` response format. +# It can be used to specify a structured output format that occurs between +# specific tags in the response. This example shows how it could be used +# to enforce the format of a tool call response, but it could be used for +# any structured output within a subset of the response. + + +def main(): + client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="-", + ) + + messages = [{ + "role": + "user", + "content": + """ +You have access to the following function to retrieve the weather in a city: + + { + "name": "get_weather", + "parameters": { + "city": { + "param_type": "string", + "description": "The city to get the weather for", + "required": True + } + } + } + +If a you choose to call a function ONLY reply in the following format: +<{start_tag}={function_name}>{parameters}{end_tag} +where + +start_tag => ` a JSON dict with the function argument name as key and function + argument value as value. +end_tag => `` + +Here is an example, +{"example_name": "example_value"} + +Reminder: +- Function calls MUST follow the specified format +- Required parameters MUST be specified +- Only call one function at a time +- Put the entire function call reply on one line +- Always add your sources when using search results to answer the user query + +You are a helpful assistant. + +Given the previous instructions, what is the weather in New York City, Boston, +and San Francisco? 
+""" + }] + + response = client.chat.completions.create( + model="meta-llama/Llama-3.1-8B-Instruct", + messages=messages, + response_format={ + "type": + "structural_tag", + "structures": [{ + "begin": "", + "schema": { + "type": "object", + "properties": { + "city": { + "type": "string" + } + } + }, + "end": "" + }], + "triggers": [" requests.Response: return response -if __name__ == "__main__": +def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3") + return parser.parse_args() + - args = parser.parse_args() +def main(args): api_url = f"http://{args.host}:{args.port}/score" model_name = args.model @@ -30,9 +32,9 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response: text_2 = "The capital of Brazil is Brasilia." prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} score_response = post_http_request(prompt=prompt, api_url=api_url) - print("Prompt when text_1 and text_2 are both strings:") + print("\nPrompt when text_1 and text_2 are both strings:") pprint.pprint(prompt) - print("Score Response:") + print("\nScore Response:") pprint.pprint(score_response.json()) text_1 = "What is the capital of France?" @@ -41,9 +43,9 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response: ] prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} score_response = post_http_request(prompt=prompt, api_url=api_url) - print("Prompt when text_1 is string and text_2 is a list:") + print("\nPrompt when text_1 is string and text_2 is a list:") pprint.pprint(prompt) - print("Score Response:") + print("\nScore Response:") pprint.pprint(score_response.json()) text_1 = [ @@ -54,7 +56,12 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response: ] prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} score_response = post_http_request(prompt=prompt, api_url=api_url) - print("Prompt when text_1 and text_2 are both lists:") + print("\nPrompt when text_1 and text_2 are both lists:") pprint.pprint(prompt) - print("Score Response:") + print("\nScore Response:") pprint.pprint(score_response.json()) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py index b7c5651e3ba..bc217f7ca7a 100644 --- a/examples/online_serving/openai_embedding_client.py +++ b/examples/online_serving/openai_embedding_client.py @@ -6,22 +6,29 @@ openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, -) - -models = client.models.list() -model = models.data[0].id - -responses = client.embeddings.create( - input=[ - "Hello my name is", - "The best thing about vLLM is that it supports many different models" - ], - model=model, -) - -for data in responses.data: - print(data.embedding) # List of float of len 4096 + +def main(): + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) + + models = client.models.list() + model = models.data[0].id + + responses = client.embeddings.create( + # ruff: noqa: E501 + input=[ + "Hello my name is", + "The best thing about vLLM is that it supports many different models" + ], + model=model, + ) + + for data in 
responses.data: + print(data.embedding) # List of float of len 4096 + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_embedding_matryoshka_fy.py b/examples/online_serving/openai_embedding_matryoshka_fy.py new file mode 100644 index 00000000000..4544dcfb5ab --- /dev/null +++ b/examples/online_serving/openai_embedding_matryoshka_fy.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Example Python client for embedding API dimensions using vLLM API server +NOTE: + start a supported Matryoshka Embeddings model server with `vllm serve`, e.g. + vllm serve jinaai/jina-embeddings-v3 --trust-remote-code +""" + +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + + +def main(): + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) + + models = client.models.list() + model = models.data[0].id + + responses = client.embeddings.create( + input=["Follow the white rabbit."], + model=model, + dimensions=32, + ) + + for data in responses.data: + print(data.embedding) # List of float of len 32 + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py index e17f9c5efd6..abcfe27c276 100644 --- a/examples/online_serving/openai_pooling_client.py +++ b/examples/online_serving/openai_pooling_client.py @@ -17,7 +17,7 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response: return response -if __name__ == "__main__": +def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) @@ -25,15 +25,20 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response: type=str, default="jason9693/Qwen2.5-1.5B-apeach") - args = parser.parse_args() + return parser.parse_args() + + +def main(args): api_url = f"http://{args.host}:{args.port}/pooling" model_name = args.model # Input like Completions API prompt = {"model": model_name, "input": "vLLM is great!"} pooling_response = post_http_request(prompt=prompt, api_url=api_url) + print("-" * 50) print("Pooling Response:") pprint.pprint(pooling_response.json()) + print("-" * 50) # Input like Chat API prompt = { @@ -50,3 +55,9 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response: pooling_response = post_http_request(prompt=prompt, api_url=api_url) print("Pooling Response:") pprint.pprint(pooling_response.json()) + print("-" * 50) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/online_serving/openai_transcription_client.py b/examples/online_serving/openai_transcription_client.py index 062868dd8ad..66e622672ef 100644 --- a/examples/online_serving/openai_transcription_client.py +++ b/examples/online_serving/openai_transcription_client.py @@ -26,7 +26,12 @@ def sync_openai(): model="openai/whisper-large-v3", language="en", response_format="json", - temperature=0.0) + temperature=0.0, + # Additional sampling params not provided by OpenAI API. 
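+            # `extra_body` fields are merged into the request JSON, which is
+            # how the client forwards vLLM-specific sampling options that the
+            # OpenAI SDK has no keyword arguments for.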
+ extra_body=dict( + seed=4419, + repetition_penalty=1.3, + )) print("transcription result:", transcription.text) @@ -41,11 +46,15 @@ async def stream_openai_response(): "model": "openai/whisper-large-v3", } url = openai_api_base + "/audio/transcriptions" + headers = {"Authorization": f"Bearer {openai_api_key}"} print("transcription result:", end=' ') async with httpx.AsyncClient() as client: with open(str(winning_call), "rb") as f: - async with client.stream('POST', url, files={'file': f}, - data=data) as response: + async with client.stream('POST', + url, + files={'file': f}, + data=data, + headers=headers) as response: async for line in response.aiter_lines(): # Each line is a JSON object prefixed with 'data: ' if line: diff --git a/examples/online_serving/ray_serve_deepseek.py b/examples/online_serving/ray_serve_deepseek.py new file mode 100644 index 00000000000..f9ef3e2da1a --- /dev/null +++ b/examples/online_serving/ray_serve_deepseek.py @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Example to deploy DeepSeek R1 or V3 with Ray Serve LLM. +See Ray Serve LLM documentation at: +https://docs.ray.io/en/latest/serve/llm/serving-llms.html + +Run `python3 ray_serve_deepseek.py` to deploy the model. +""" + +from ray import serve +from ray.serve.llm import LLMConfig, build_openai_app + +llm_config = LLMConfig( + model_loading_config={ + "model_id": "deepseek", + # Since DeepSeek model is huge, it is recommended to pre-download + # the model to local disk, say /path/to/the/model and specify: + # model_source="/path/to/the/model" + "model_source": "deepseek-ai/DeepSeek-R1", + }, + deployment_config={ + "autoscaling_config": { + "min_replicas": 1, + "max_replicas": 1, + } + }, + # Change to the accelerator type of the node + accelerator_type="H100", + runtime_env={"env_vars": { + "VLLM_USE_V1": "1" + }}, + # Customize engine arguments as needed (e.g. vLLM engine kwargs) + engine_kwargs={ + "tensor_parallel_size": 8, + "pipeline_parallel_size": 2, + "gpu_memory_utilization": 0.92, + "dtype": "auto", + "max_num_seqs": 40, + "max_model_len": 16384, + "enable_chunked_prefill": True, + "enable_prefix_caching": True, + "trust_remote_code": True, + }, +) + +# Deploy the application +llm_app = build_openai_app({"llm_configs": [llm_config]}) +serve.run(llm_app) diff --git a/examples/online_serving/streamlit_openai_chatbot_webserver.py b/examples/online_serving/streamlit_openai_chatbot_webserver.py new file mode 100644 index 00000000000..d8a0f211d44 --- /dev/null +++ b/examples/online_serving/streamlit_openai_chatbot_webserver.py @@ -0,0 +1,185 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +vLLM Chat Assistant - A Streamlit Web Interface + +A streamlined chat interface that quickly integrates +with vLLM API server. 
+ +Features: +- Multiple chat sessions management +- Streaming response display +- Configurable API endpoint +- Real-time chat history + +Requirements: + pip install streamlit openai + +Usage: + # Start the app with default settings + streamlit run streamlit_openai_chatbot_webserver.py + + # Start with custom vLLM API endpoint + VLLM_API_BASE="http://your-server:8000/v1" \ + streamlit run streamlit_openai_chatbot_webserver.py + + # Enable debug mode + streamlit run streamlit_openai_chatbot_webserver.py \ + --logger.level=debug +""" +import os +from datetime import datetime + +import streamlit as st +from openai import OpenAI + +# Get command line arguments from environment variables +openai_api_key = os.getenv('VLLM_API_KEY', "EMPTY") +openai_api_base = os.getenv('VLLM_API_BASE', "http://localhost:8000/v1") + +# Initialize session states for managing chat sessions +if "sessions" not in st.session_state: + st.session_state.sessions = {} + +if "current_session" not in st.session_state: + st.session_state.current_session = None + +if "messages" not in st.session_state: + st.session_state.messages = [] + +if "active_session" not in st.session_state: + st.session_state.active_session = None + +# Initialize session state for API base URL +if "api_base_url" not in st.session_state: + st.session_state.api_base_url = openai_api_base + + +def create_new_chat_session(): + """Create a new chat session with timestamp as ID""" + session_id = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + st.session_state.sessions[session_id] = [] + st.session_state.current_session = session_id + st.session_state.active_session = session_id + st.session_state.messages = [] + + +def switch_to_chat_session(session_id): + """Switch to a different chat session""" + st.session_state.current_session = session_id + st.session_state.active_session = session_id + st.session_state.messages = st.session_state.sessions[session_id] + + +def get_llm_response(messages, model): + """Get streaming response from llm + + Args: + messages: List of message dictionaries + model: Name of model + + Returns: + Streaming response object or error message string + """ + try: + response = client.chat.completions.create(model=model, + messages=messages, + stream=True) + return response + except Exception as e: + st.error(f"Error details: {str(e)}") + return f"Error: {str(e)}" + + +# Sidebar - API Settings first +st.sidebar.title("API Settings") +new_api_base = st.sidebar.text_input("API Base URL:", + value=st.session_state.api_base_url) +if new_api_base != st.session_state.api_base_url: + st.session_state.api_base_url = new_api_base + st.rerun() + +st.sidebar.divider() + +# Sidebar - Session Management +st.sidebar.title("Chat Sessions") +if st.sidebar.button("New Session"): + create_new_chat_session() + +# Display all sessions in reverse chronological order +for session_id in sorted(st.session_state.sessions.keys(), reverse=True): + # Mark the active session with a pinned button + if session_id == st.session_state.active_session: + st.sidebar.button(f"📍 {session_id}", + key=session_id, + type="primary", + on_click=switch_to_chat_session, + args=(session_id, )) + else: + st.sidebar.button(f"Session {session_id}", + key=session_id, + on_click=switch_to_chat_session, + args=(session_id, )) + +# Main interface +st.title("vLLM Chat Assistant") + +# Initialize OpenAI client with API settings +client = OpenAI(api_key=openai_api_key, base_url=st.session_state.api_base_url) + +# Get and display current model id +models = client.models.list() +model = 
models.data[0].id +st.markdown(f"**Model**: {model}") + +# Initialize first session if none exists +if st.session_state.current_session is None: + create_new_chat_session() + st.session_state.active_session = st.session_state.current_session + +# Display chat history for current session +for message in st.session_state.messages: + with st.chat_message(message["role"]): + st.write(message["content"]) + +# Handle user input and generate llm response +if prompt := st.chat_input("Type your message here..."): + # Save user message to session + st.session_state.messages.append({"role": "user", "content": prompt}) + st.session_state.sessions[ + st.session_state.current_session] = st.session_state.messages + + # Display user message + with st.chat_message("user"): + st.write(prompt) + + # Prepare messages for llm + messages_for_llm = [{ + "role": m["role"], + "content": m["content"] + } for m in st.session_state.messages] + + # Generate and display llm response + with st.chat_message("assistant"): + message_placeholder = st.empty() + full_response = "" + + # Get streaming response from llm + response = get_llm_response(messages_for_llm, model) + if isinstance(response, str): + message_placeholder.markdown(response) + full_response = response + else: + for chunk in response: + if hasattr(chunk.choices[0].delta, "content"): + content = chunk.choices[0].delta.content + if content: + full_response += content + message_placeholder.markdown(full_response + "▌") + + message_placeholder.markdown(full_response) + + # Save llm response to session history + st.session_state.messages.append({ + "role": "assistant", + "content": full_response + }) diff --git a/examples/tool_chat_template_llama4_json.jinja b/examples/tool_chat_template_llama4_json.jinja new file mode 100644 index 00000000000..759f1655443 --- /dev/null +++ b/examples/tool_chat_template_llama4_json.jinja @@ -0,0 +1,116 @@ +{%- macro is_array_of_type_objects(var) -%} + {%- if var is iterable and var is not string -%} + {%- set valid = true -%} + {%- for item in var -%} + {%- if 'type' not in item -%} + {%- set valid = false -%} + {%- break -%} + {%- endif -%} + {%- endfor -%} + {{ valid }} + {%- else -%} + {{ false }} + {%- endif -%} +{%- endmacro %} + +{%- macro render_message(message) %} + {%- if message['content'] is string %} + {{- message['content']|trim }} + {%- elif is_array_of_type_objects(data) == 'True' %} + {%- for content in message['content'] %} + {%- if content['type'] == 'image' %} + {{- '<|image|>' }} + {%- elif content['type'] == 'text' %} + {{- content['text']|trim }} + {%- endif %} + {%- endfor %} + {%- else %} + {{- message['content']|tojson }} + {%- endif %} +{%- endmacro %} + +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0] %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = ({ "content": "You are a helpful assistant with tool calling " + "capabilities. Only reply with a tool call if the function exists in the " + "library provided by the user. If it doesn't exist, just reply directly in " + "natural language. 
When you receive a tool call response, use the output to " + "format an answer to the original user question."}) %} +{%- endif %} + +{%- set tool_lib_preamble = 'Tools: You have access to the following tools. You might need to use one ' + 'or more function/tool calls to fulfill the task. \n' + 'If none are needed, then proceed to the response.\n\n' + 'Tool Call Syntax: You can call tools using the following syntax:\n' + '{"name": function name, "parameters": dictionary of argument name and its value}.\n' + 'Separate multiple function calls by "; ". Do not use variables.\n' + 'Do not include anything else when calling the tools with the syntax above.\n\n' + 'Here is a list of functions in JSON format that you can invoke.\n' %} + +{{- "<|header_start|>system<|header_end|>\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- tool_lib_preamble }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- render_message(system_message) }} +{{ "<|eot|>\n" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0] %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|header_start|>user<|header_end|>\n\n' }} + {{- tool_lib_preamble }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- render_message(first_user_message) + "\n<|eot|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }} + {{- render_message(message) }} + {{- "\n<|eot|>" }} + {%- elif 'tool_calls' in message and message.tool_calls|length > 0 %} + {{- '\n<|header_start|>assistant<|header_end|>\n\n' -}} + {{- render_message(message) }} + {%- for tool_call in message.tool_calls %} + {{- '{"name": "' + tool_call.function.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.function.arguments | tojson }} + {{- "}" }} + {%- endfor %} + {{- "\n<|eot|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "\n<|header_start|>ipython<|header_end|>\n\n" }} + {{- render_message(message) }} + {{- "\n<|eom|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '\n<|header_start|>assistant<|header_end|>\n\n' }} +{%- endif %} diff --git a/examples/tool_chat_template_mistral3.jinja b/examples/tool_chat_template_mistral3.jinja new file mode 100644 index 00000000000..2b2f94d7ef5 --- /dev/null +++ b/examples/tool_chat_template_mistral3.jinja @@ -0,0 +1,119 @@ +{%- set today = strftime_now("%Y-%m-%d") %} +{%- set default_system_message = "You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\nYour knowledge base was last updated on 2023-10-01. The current date is " + today + ".\n\nWhen you're not sure about some information, you say that you don't have the information and don't make up anything.\nIf the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. 
\"What are some good restaurants around me?\" => \"Where are you?\" or \"When is the next flight to Tokyo\" => \"Where do you travel from?\")" %} + +{{- bos_token }} + +{%- if messages[0]['role'] == 'system' %} + {%- if messages[0]['content'] is string %} + {%- set system_message = messages[0]['content'] %} + {%- set loop_messages = messages[1:] %} + {%- else %} + {%- set system_message = messages[0]['content'][0]['text'] %} + {%- set loop_messages = messages[1:] %} + {%- endif %} +{%- else %} + {%- set system_message = default_system_message %} + {%- set loop_messages = messages %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- elif tools is not none %} + {%- set parallel_tool_prompt = "You are a helpful assistant that can call tools. If you call one or more tools, format them in a single JSON array or objects, where each object is a tool call, not as separate objects outside of an array or multiple arrays. Use the format [{\"name\": tool call name, \"arguments\": tool call arguments}, additional tool calls] if you call more than one tool. If you call tools, do not attempt to interpret them or otherwise provide a response until you receive a tool call result that you can interpret for the user." %} + {%- if system_message is defined %} + {%- set system_message = parallel_tool_prompt + "\n\n" + system_message %} + {%- else %} + {%- set system_message = parallel_tool_prompt %} + {%- endif %} +{%- endif %} +{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }} + +{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %} + +{%- for message in loop_messages | rejectattr("role", "equalto", "tool") | rejectattr("role", "equalto", "tool_results") | selectattr("tool_calls", "undefined") %} + {%- if (message["role"] == "user") != (loop.index0 % 2 == 0) %} + {{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }} + {%- endif %} +{%- endfor %} + +{%- for message in loop_messages %} + {%- if message["role"] == "user" %} + {%- if tools is not none and (message == user_messages[-1]) %} + {{- "[AVAILABLE_TOOLS] [" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- '{"type": "function", "function": {' }} + {%- for key, val in tool.items() if key != "return" %} + {%- if val is string %} + {{- '"' + key + '": "' + val + '"' }} + {%- else %} + {{- '"' + key + '": ' + val|tojson }} + {%- endif %} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- "}}" }} + {%- if not loop.last %} + {{- ", " }} + {%- else %} + {{- "]" }} + {%- endif %} + {%- endfor %} + {{- "[/AVAILABLE_TOOLS]" }} + {%- endif %} + {%- if message['content'] is string %} + {{- '[INST]' + message['content'] + '[/INST]' }} + {%- else %} + {{- '[INST]' }} + {%- for block in message['content'] %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- elif block['type'] == 'image' or block['type'] == 'image_url' %} + {{- '[IMG]' }} + {%- else %} + {{- raise_exception('Only text and image blocks are supported in message content!') }} + {%- endif %} + {%- endfor %} + {{- '[/INST]' }} + {%- endif %} + {%- elif message["role"] == "tool_calls" or message.tool_calls is defined %} + {%- if message.tool_calls is defined %} + {%- set tool_calls = message.tool_calls %} + {%- else %} + {%- set tool_calls = message.content %} + {%- endif %} + {{- "[TOOL_CALLS] [" }} + {%- for tool_call in tool_calls %} + {%- set out = tool_call.function|tojson %} + {{- out[:-1] }} + 
{%- if not tool_call.id is defined or tool_call.id|length < 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! (1)" + tool_call.id) }} + {%- endif %} + {{- ', "id": "' + tool_call.id[-9:] + '"}' }} + {%- if not loop.last %} + {{- ", " }} + {%- else %} + {{- "]" + eos_token }} + {%- endif %} + {%- endfor %} + {%- elif message['role'] == 'assistant' %} + {%- if message['content'] is string %} + {{- message['content'] + eos_token }} + {%- else %} + {{- message['content'][0]['text'] + eos_token }} + {%- endif %} + {%- elif message["role"] == "tool_results" or message["role"] == "tool" %} + {%- if message.content is defined and message.content.content is defined %} + {%- set content = message.content.content %} + {%- else %} + {%- set content = message.content %} + {%- endif %} + {{- '[TOOL_RESULTS] {"content": ' + content|string + ", " }} + {%- if not message.tool_call_id is defined or message.tool_call_id|length < 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! (2)" + message.tool_call_id) }} + {%- endif %} + {{- '"call_id": "' + message.tool_call_id[-9:] + '"}[/TOOL_RESULTS]' }} + {%- else %} + {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }} + {%- endif %} +{%- endfor %} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 167e975c70f..069e295bfb9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,10 +3,10 @@ requires = [ "cmake>=3.26", "ninja", - "packaging", - "setuptools>=61", + "packaging>=24.2", + "setuptools>=77.0.3,<80.0.0", "setuptools-scm>=8.0", - "torch == 2.6.0", + "torch == 2.7.0", "wheel", "jinja2", ] @@ -15,7 +15,8 @@ build-backend = "setuptools.build_meta" [project] name = "vllm" authors = [{name = "vLLM Team"}] -license = { "file"= "LICENSE" } +license = "Apache-2.0" +license-files = ["LICENSE"] readme = "README.md" description = "A high-throughput and memory-efficient inference and serving engine for LLMs" classifiers = [ @@ -23,7 +24,6 @@ classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", - "License :: OSI Approved :: Apache Software License", "Intended Audience :: Developers", "Intended Audience :: Information Technology", "Intended Audience :: Science/Research", @@ -46,8 +46,7 @@ vllm = "vllm.entrypoints.cli.main:main" [tool.setuptools.packages.find] where = ["."] -exclude = ["benchmarks", "csrc", "docs", "examples", "tests*"] -namespaces = false +include = ["vllm*"] [tool.yapfignore] ignore_patterns = [ @@ -59,7 +58,8 @@ ignore_patterns = [ line-length = 80 exclude = [ # External file, leaving license intact - "examples/other/fp8/quantizer/quantize.py" + "examples/other/fp8/quantizer/quantize.py", + "vllm/vllm_flash_attn/flash_attn_interface.pyi" ] [tool.ruff.lint.per-file-ignores] @@ -158,7 +158,6 @@ markers = [ "skip_global_cleanup", "core_model: enable this model test in each PR instead of only nightly", "cpu_model: enable this model test in CPU tests", - "quant_model: run this model test under Quantized category", "split: run this test as part of a split", "distributed: run this test only in distributed GPU tests", "skip_v1: do not run this test with v1", diff --git a/requirements/build.txt b/requirements/build.txt index 13d643bcaff..5edc593b927 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -1,9 +1,9 @@ # Should be mirrored in pyproject.toml cmake>=3.26 ninja 
-packaging -setuptools>=61 +packaging>=24.2 +setuptools>=77.0.3,<80.0.0 setuptools-scm>=8 -torch==2.6.0 +torch==2.7.0 wheel jinja2>=3.1.6 diff --git a/requirements/common.txt b/requirements/common.txt index 4df32460c2d..7ea27753eab 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -8,7 +8,7 @@ blake3 py-cpuinfo transformers >= 4.51.1 huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads. -tokenizers >= 0.19.1 # Required for Llama 3. +tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. aiohttp @@ -34,9 +34,9 @@ mistral_common[opencv] >= 1.5.4 opencv-python-headless >= 4.11.0 # required for video IO pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 -setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 +setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. -compressed-tensors == 0.9.3 # required for compressed-tensors +compressed-tensors == 0.9.4 # required for compressed-tensors depyf==0.18.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py watchfiles # required for http server to monitor the updates of TLS files diff --git a/requirements/cpu.txt b/requirements/cpu.txt index 69f732c2417..752931158a0 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -2,18 +2,19 @@ -r common.txt # Dependencies for CPUs -torch==2.6.0+cpu; platform_machine == "x86_64" -torch==2.6.0; platform_system == "Darwin" -torch==2.6.0; platform_machine == "ppc64le" or platform_machine == "aarch64" +--extra-index-url https://download.pytorch.org/whl/cpu +torch==2.7.0+cpu; platform_machine == "x86_64" +torch==2.7.0; platform_system == "Darwin" +torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64" torch==2.7.0.dev20250304; platform_machine == "s390x" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" -torchaudio==2.6.0; platform_machine == "ppc64le" +torchaudio==2.7.0; platform_machine == "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch torchvision; platform_machine != "ppc64le" and platform_machine != "s390x" -torchvision==0.21.0; platform_machine == "ppc64le" +torchvision==0.22.0; platform_machine == "ppc64le" datasets # for benchmark scripts # cpu cannot use triton 3.3.0 diff --git a/requirements/cuda.txt b/requirements/cuda.txt index cdc6ee75afb..a71d9728f38 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -6,8 +6,9 @@ numba == 0.61.2; python_version > '3.9' # Dependencies for NVIDIA GPUs ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1. -torch==2.6.0 -torchaudio==2.6.0 +torch==2.7.0 +torchaudio==2.7.0 # These must be updated alongside torch -torchvision==0.21.0 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version -xformers==0.0.29.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.6.0 +torchvision==0.22.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +# https://github.com/facebookresearch/xformers/releases/tag/v0.0.30 +xformers==0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 diff --git a/requirements/docs.txt b/requirements/docs.txt index 416ca503b36..401f714ae9f 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,25 +1,15 @@ -sphinx==6.2.1 -sphinx-argparse==0.4.0 -sphinx-book-theme==1.0.1 +sphinx==8.2.3 +sphinx-argparse==0.5.2 +sphinx-autodoc2==0.5.0 +sphinx-book-theme==1.1.4 sphinx-copybutton==0.5.2 sphinx-design==0.6.1 sphinx-togglebutton==0.3.2 -myst-parser==3.0.1 +myst-parser==4.0.1 msgspec -cloudpickle +commonmark # Required by sphinx-argparse when using :markdownhelp: # packages to install to build the documentation cachetools -pydantic >= 2.8 -f https://download.pytorch.org/whl/cpu -torch -py-cpuinfo -transformers -mistral_common >= 1.5.4 -aiohttp -starlette -openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args -fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args -partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args -requests -zmq +torch \ No newline at end of file diff --git a/requirements/hpu.txt b/requirements/hpu.txt index 830f6ef3f50..a88777268a3 100644 --- a/requirements/hpu.txt +++ b/requirements/hpu.txt @@ -7,6 +7,6 @@ triton==3.1.0 pandas numpy==1.26.4 tabulate -setuptools>=61 +setuptools>=77.0.3,<80.0.0 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@4312768 +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@f1f6624 diff --git a/requirements/neuron.txt b/requirements/neuron.txt index 5f25bd0546e..f8e3030834e 100644 --- a/requirements/neuron.txt +++ b/requirements/neuron.txt @@ -2,5 +2,7 @@ -r common.txt # Dependencies for Neuron devices +packaging>=24.2 +setuptools>=77.0.3,<80.0.0 torch-neuronx >= 2.5.0 neuronx-cc diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt new file mode 100644 index 00000000000..e2711354ac1 --- /dev/null +++ b/requirements/nightly_torch_test.txt @@ -0,0 +1,33 @@ +# Dependency that able to run entrypoints test +# pytest and its extensions +pytest +pytest-asyncio +pytest-forked +pytest-mock +pytest-rerunfailures +pytest-shard +pytest-timeout + + +librosa # required by audio tests in entrypoints/openai +sentence-transformers +numba == 0.61.2; python_version > '3.9' +# testing utils +boto3 +botocore +datasets +ray >= 2.10.0 +peft +runai-model-streamer==0.11.0 +runai-model-streamer-s3==0.11.0 +tensorizer>=2.9.0 +lm-eval==0.4.8 +buildkite-test-collector==0.1.9 +lm-eval[api]==0.4.8 # required for model evaluation test + +# required for quantization test +bitsandbytes>=0.45.3 + +# required for minicpmo_26 test +vector_quantize_pytorch +vocos diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt index 29d5647807b..981b90632c1 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -2,13 +2,14 @@ -r common.txt --extra-index-url https://download.pytorch.org/whl/rocm6.2.4 
-torch==2.6.0 -torchvision==0.21.0 -torchaudio==2.6.0 +torch==2.7.0 +torchvision==0.22.0 +torchaudio==2.7.0 +triton==3.2 cmake>=3.26,<4 -packaging -setuptools>=61 +packaging>=24.2 +setuptools>=77.0.3,<80.0.0 setuptools-scm>=8 wheel jinja2>=3.1.6 diff --git a/requirements/rocm.txt b/requirements/rocm.txt index 4df92aab374..8a84f2ff1ed 100644 --- a/requirements/rocm.txt +++ b/requirements/rocm.txt @@ -5,11 +5,10 @@ numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Req numba == 0.61.2; python_version > '3.9' # Dependencies for AMD GPUs -awscli boto3 botocore datasets -ray >= 2.10.0 +ray>=2.10.0,<2.45.0 peft pytest-asyncio tensorizer>=2.9.0 diff --git a/requirements/test.in b/requirements/test.in index b9b3df0651b..cdc7c563f08 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -8,7 +8,6 @@ pytest-shard pytest-timeout # testing utils -awscli backoff # required for phi4mm test blobfile # required for kimi-vl test einops # required for MPT, qwen-vl and Mamba @@ -23,18 +22,21 @@ sentence-transformers # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests timm # required for internvl test -torch==2.6.0 -torchaudio==2.6.0 -torchvision==0.21.0 +torch==2.7.0 +torchaudio==2.7.0 +torchvision==0.22.0 transformers_stream_generator # required for qwen-vl test +mamba_ssm # required for plamo2 test matplotlib # required for qwen-vl test mistral_common[opencv] >= 1.5.4 # required for pixtral test num2words # required for smolvlm test opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.8 # required for model evaluation test -transformers==4.51.1 +transformers==4.51.3 +tokenizers==0.21.1 huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads. +schemathesis>=3.39.15 # Required for openai schema test. 
# quantization bitsandbytes>=0.45.3 buildkite-test-collector==0.1.9 diff --git a/requirements/test.txt b/requirements/test.txt index a5c062b0b1f..9a15d9a0d82 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile requirements/test.in -o requirements/test.txt +# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128 absl-py==2.1.0 # via rouge-score accelerate==1.0.1 @@ -20,21 +20,27 @@ aiosignal==1.3.1 annotated-types==0.7.0 # via pydantic anyio==4.6.2.post1 - # via httpx + # via + # httpx + # starlette argcomplete==3.5.1 # via datamodel-code-generator +arrow==1.3.0 + # via isoduration attrs==24.2.0 # via # aiohttp + # hypothesis # jsonlines # jsonschema + # pytest-subtests # referencing audioread==3.0.1 # via librosa -awscli==1.35.23 - # via -r requirements/test.in backoff==2.2.1 - # via -r requirements/test.in + # via + # -r requirements/test.in + # schemathesis bitsandbytes==0.45.3 # via -r requirements/test.in black==24.10.0 @@ -45,7 +51,6 @@ boto3==1.35.57 # via tensorizer botocore==1.35.57 # via - # awscli # boto3 # s3transfer bounded-pool-executor==0.0.3 @@ -69,11 +74,12 @@ click==8.1.7 # jiwer # nltk # ray + # schemathesis # typer colorama==0.4.6 # via - # awscli # sacrebleu + # schemathesis # tqdm-multiprocess contourpy==1.3.0 # via matplotlib @@ -105,12 +111,11 @@ dnspython==2.7.0 # via email-validator docopt==0.6.2 # via num2words -docutils==0.16 - # via awscli einops==0.8.0 # via # -r requirements/test.in # encodec + # mamba-ssm # vector-quantize-pytorch # vocos einx==0.3.0 @@ -137,6 +142,8 @@ filelock==3.16.1 # transformers fonttools==4.54.1 # via matplotlib +fqdn==1.5.1 + # via jsonschema frozendict==2.4.6 # via einx frozenlist==1.5.0 @@ -155,8 +162,12 @@ genai-perf==0.0.8 # via -r requirements/test.in genson==1.3.0 # via datamodel-code-generator +graphql-core==3.2.6 + # via hypothesis-graphql h11==0.14.0 # via httpcore +harfile==0.3.0 + # via schemathesis hf-xet==0.1.4 # via huggingface-hub hiredis==3.0.0 @@ -164,7 +175,9 @@ hiredis==3.0.0 httpcore==1.0.6 # via httpx httpx==0.27.2 - # via -r requirements/test.in + # via + # -r requirements/test.in + # schemathesis huggingface-hub==0.30.1 # via # -r requirements/test.in @@ -179,17 +192,29 @@ huggingface-hub==0.30.1 # vocos humanize==4.11.0 # via runai-model-streamer +hypothesis==6.131.0 + # via + # hypothesis-graphql + # hypothesis-jsonschema + # schemathesis +hypothesis-graphql==0.11.1 + # via schemathesis +hypothesis-jsonschema==0.23.1 + # via schemathesis idna==3.10 # via # anyio # email-validator # httpx + # jsonschema # requests # yarl inflect==5.6.2 # via datamodel-code-generator iniconfig==2.0.0 # via pytest +isoduration==20.11.0 + # via jsonschema isort==5.13.2 # via datamodel-code-generator jinja2==3.1.6 @@ -209,12 +234,18 @@ joblib==1.4.2 # scikit-learn jsonlines==4.0.0 # via lm-eval +jsonpointer==3.0.0 + # via jsonschema jsonschema==4.23.0 # via + # hypothesis-jsonschema # mistral-common # ray + # schemathesis jsonschema-specifications==2024.10.1 # via jsonschema +junit-xml==1.9 + # via schemathesis kaleido==0.2.1 # via genai-perf kiwisolver==1.4.7 @@ -233,10 +264,14 @@ lxml==5.3.0 # via # blobfile # sacrebleu +mamba-ssm==2.2.4 + # via -r requirements/test.in markdown-it-py==3.0.0 # via rich -markupsafe==3.0.2 - # via jinja2 +markupsafe==3.0.1 + # via + # jinja2 + # werkzeug matplotlib==3.9.2 # via -r requirements/test.in mbstrdecoder==1.1.3 @@ -268,6 
+303,8 @@ mypy-extensions==1.0.0 # via black networkx==3.2.1 # via torch +ninja==1.11.1.3 + # via mamba-ssm nltk==3.9.1 # via rouge-score num2words==0.5.14 @@ -312,45 +349,48 @@ numpy==1.26.4 # transformers # tritonclient # vocos -nvidia-cublas-cu12==12.4.5.8 +nvidia-cublas-cu12==12.8.3.14 # via # nvidia-cudnn-cu12 # nvidia-cusolver-cu12 # torch -nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-cupti-cu12==12.8.57 # via torch -nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.8.61 # via torch -nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.8.57 # via torch -nvidia-cudnn-cu12==9.1.0.70 +nvidia-cudnn-cu12==9.7.1.26 # via torch -nvidia-cufft-cu12==11.2.1.3 +nvidia-cufft-cu12==11.3.3.41 # via torch -nvidia-curand-cu12==10.3.5.147 +nvidia-cufile-cu12==1.13.0.11 # via torch -nvidia-cusolver-cu12==11.6.1.9 +nvidia-curand-cu12==10.3.9.55 # via torch -nvidia-cusparse-cu12==12.3.1.170 +nvidia-cusolver-cu12==11.7.2.55 + # via torch +nvidia-cusparse-cu12==12.5.7.53 # via # nvidia-cusolver-cu12 # torch -nvidia-cusparselt-cu12==0.6.2 +nvidia-cusparselt-cu12==0.6.3 # via torch -nvidia-nccl-cu12==2.21.5 +nvidia-nccl-cu12==2.26.2 # via torch -nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvjitlink-cu12==12.8.61 # via + # nvidia-cufft-cu12 # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 # torch -nvidia-nvtx-cu12==12.4.127 +nvidia-nvtx-cu12==12.8.55 # via torch opencv-python-headless==4.11.0.86 # via # -r requirements/test.in # mistral-common -packaging==24.1 +packaging==24.2 # via # accelerate # black @@ -360,6 +400,7 @@ packaging==24.1 # fastparquet # huggingface-hub # lazy-loader + # mamba-ssm # matplotlib # peft # plotly @@ -425,8 +466,6 @@ pyarrow==18.0.0 # via # datasets # genai-perf -pyasn1==0.6.1 - # via rsa pybind11==2.13.6 # via lm-eval pycparser==2.22 @@ -443,6 +482,8 @@ pygments==2.18.0 # via rich pyparsing==3.2.0 # via matplotlib +pyrate-limiter==3.7.0 + # via schemathesis pytablewriter==1.2.0 # via lm-eval pytest==8.3.3 @@ -455,7 +496,9 @@ pytest==8.3.3 # pytest-mock # pytest-rerunfailures # pytest-shard + # pytest-subtests # pytest-timeout + # schemathesis pytest-asyncio==0.24.0 # via -r requirements/test.in pytest-forked==1.6.0 @@ -466,10 +509,13 @@ pytest-rerunfailures==14.0 # via -r requirements/test.in pytest-shard==0.1.2 # via -r requirements/test.in +pytest-subtests==0.14.1 + # via schemathesis pytest-timeout==2.3.1 # via -r requirements/test.in python-dateutil==2.9.0.post0 # via + # arrow # botocore # matplotlib # pandas @@ -483,7 +529,6 @@ pytz==2024.2 pyyaml==6.0.2 # via # accelerate - # awscli # datamodel-code-generator # datasets # genai-perf @@ -491,6 +536,7 @@ pyyaml==6.0.2 # peft # ray # responses + # schemathesis # timm # transformers # vocos @@ -521,10 +567,16 @@ requests==2.32.3 # pooch # ray # responses + # schemathesis + # starlette-testclient # tiktoken # transformers responses==0.25.3 # via genai-perf +rfc3339-validator==0.1.4 + # via jsonschema +rfc3987==1.3.8 + # via jsonschema rich==13.9.4 # via # genai-perf @@ -535,16 +587,12 @@ rpds-py==0.20.1 # via # jsonschema # referencing -rsa==4.7.2 - # via awscli runai-model-streamer==0.11.0 # via -r requirements/test.in runai-model-streamer-s3==0.11.0 # via -r requirements/test.in s3transfer==0.10.3 - # via - # awscli - # boto3 + # via boto3 sacrebleu==2.4.3 # via lm-eval safetensors==0.4.5 @@ -553,6 +601,8 @@ safetensors==0.4.5 # peft # timm # transformers +schemathesis==3.39.15 + # via -r requirements/test.in scikit-learn==1.5.2 # via # librosa @@ -569,20 +619,26 @@ sentence-transformers==3.2.1 # via -r 
requirements/test.in sentencepiece==0.2.0 # via mistral-common -setuptools==75.8.0 +setuptools==77.0.3 # via + # mamba-ssm # pytablewriter # torch + # triton shellingham==1.5.4 # via typer six==1.16.0 # via + # junit-xml # python-dateutil + # rfc3339-validator # rouge-score sniffio==1.3.1 # via # anyio # httpx +sortedcontainers==2.4.0 + # via hypothesis soundfile==0.12.1 # via # -r requirements/test.in @@ -591,9 +647,15 @@ soxr==0.5.0.post1 # via librosa sqlitedict==2.1.0 # via lm-eval +starlette==0.46.2 + # via + # schemathesis + # starlette-testclient +starlette-testclient==0.4.1 + # via schemathesis statsmodels==0.14.4 # via genai-perf -sympy==1.13.1 +sympy==1.13.3 # via # einx # torch @@ -617,9 +679,15 @@ tiktoken==0.7.0 # mistral-common timm==1.0.11 # via -r requirements/test.in -tokenizers==0.21.0 - # via transformers -torch==2.6.0 +tokenizers==0.21.1 + # via + # -r requirements/test.in + # transformers +tomli==2.2.1 + # via schemathesis +tomli-w==1.2.0 + # via schemathesis +torch==2.7.0+cu128 # via # -r requirements/test.in # accelerate @@ -627,6 +695,7 @@ torch==2.6.0 # encodec # fastsafetensors # lm-eval + # mamba-ssm # peft # runai-model-streamer # sentence-transformers @@ -636,12 +705,12 @@ torch==2.6.0 # torchvision # vector-quantize-pytorch # vocos -torchaudio==2.6.0 +torchaudio==2.7.0+cu128 # via # -r requirements/test.in # encodec # vocos -torchvision==0.21.0 +torchvision==0.22.0+cu128 # via # -r requirements/test.in # timm @@ -659,17 +728,18 @@ tqdm==4.66.6 # transformers tqdm-multiprocess==0.0.11 # via lm-eval -transformers==4.51.1 +transformers==4.51.3 # via # -r requirements/test.in # genai-perf # lm-eval + # mamba-ssm # peft # sentence-transformers # transformers-stream-generator transformers-stream-generator==0.0.5 # via -r requirements/test.in -triton==3.2.0 +triton==3.3.0 # via torch tritonclient==2.51.0 # via @@ -682,6 +752,8 @@ typepy==1.3.2 # tabledata typer==0.15.2 # via fastsafetensors +types-python-dateutil==2.9.0.20241206 + # via arrow typing-extensions==4.12.2 # via # huggingface-hub @@ -694,6 +766,8 @@ typing-extensions==4.12.2 # typer tzdata==2024.2 # via pandas +uri-template==1.3.0 + # via jsonschema urllib3==2.2.3 # via # blobfile @@ -705,6 +779,10 @@ vector-quantize-pytorch==1.21.2 # via -r requirements/test.in vocos==0.1.0 # via -r requirements/test.in +webcolors==24.11.1 + # via jsonschema +werkzeug==3.1.3 + # via schemathesis word2number==1.1 # via lm-eval xxhash==3.5.0 @@ -712,6 +790,8 @@ xxhash==3.5.0 # datasets # evaluate yarl==1.17.1 - # via aiohttp + # via + # aiohttp + # schemathesis zstandard==0.23.0 # via lm-eval diff --git a/requirements/tpu.txt b/requirements/tpu.txt index b63993ba1ee..17d57058bfa 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -3,12 +3,13 @@ # Dependencies for TPU cmake>=3.26 -packaging +packaging>=24.2 setuptools-scm>=8 wheel jinja2>=3.1.6 ray[default] ray[data] +setuptools==78.1.0 # Install torch_xla --pre diff --git a/requirements/xpu.txt b/requirements/xpu.txt index fa09004d0a9..04c4d4ff85a 100644 --- a/requirements/xpu.txt +++ b/requirements/xpu.txt @@ -3,14 +3,14 @@ ray>=2.9 cmake>=3.26 -packaging +packaging>=24.2 setuptools-scm>=8 -setuptools>=75.8.0 +setuptools>=77.0.3,<80.0.0 wheel jinja2>=3.1.6 datasets # for benchmark scripts -torch==2.6.0+xpu +torch==2.7.0+xpu torchaudio torchvision pytorch-triton-xpu @@ -18,6 +18,6 @@ pytorch-triton-xpu # Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu # FIXME: 
This will be fix in ipex 2.7. just leave this here for awareness. -# intel-extension-for-pytorch==2.6.10+xpu -oneccl_bind_pt==2.6.0+xpu +intel-extension-for-pytorch==2.7.10+xpu +oneccl_bind_pt==2.7.0+xpu --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ diff --git a/setup.py b/setup.py index b0cc2f48163..7675fbdf3ef 100755 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ def load_module_from_path(module_name, path): # fallback to cpu VLLM_TARGET_DEVICE = "cpu" -MAIN_CUDA_VERSION = "12.4" +MAIN_CUDA_VERSION = "12.8" def is_sccache_available() -> bool: @@ -269,15 +269,17 @@ def run(self): # First, run the standard build_ext command to compile the extensions super().run() - # copy vllm/vllm_flash_attn/*.py from self.build_lib to current + # copy vllm/vllm_flash_attn/**/*.py from self.build_lib to current # directory so that they can be included in the editable build import glob - files = glob.glob( - os.path.join(self.build_lib, "vllm", "vllm_flash_attn", "*.py")) + files = glob.glob(os.path.join(self.build_lib, "vllm", + "vllm_flash_attn", "**", "*.py"), + recursive=True) for file in files: dst_file = os.path.join("vllm/vllm_flash_attn", - os.path.basename(file)) + file.split("vllm/vllm_flash_attn/")[-1]) print(f"Copying {file} to {dst_file}") + os.makedirs(os.path.dirname(dst_file), exist_ok=True) self.copy_file(file, dst_file) @@ -377,13 +379,22 @@ def run(self) -> None: "vllm/_flashmla_C.abi3.so", "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", - "vllm/vllm_flash_attn/flash_attn_interface.py", - "vllm/vllm_flash_attn/__init__.py", "vllm/cumem_allocator.abi3.so", # "vllm/_version.py", # not available in nightly wheels yet ] - file_members = filter(lambda x: x.filename in files_to_copy, - wheel.filelist) + + file_members = list( + filter(lambda x: x.filename in files_to_copy, wheel.filelist)) + + # vllm_flash_attn python code: + # Regex from + # `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)` + import re + compiled_regex = re.compile( + r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") + file_members += list( + filter(lambda x: compiled_regex.match(x.filename), + wheel.filelist)) for file in file_members: print(f"Extracting and including {file.filename} " diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 0b76779b3a7..b6b45d1cbe8 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -103,7 +103,8 @@ def test_compile_correctness( method = test_setting.method fullgraph = test_setting.fullgraph if cuda_device_count_stateless() != pp_size * tp_size: - pytest.skip("Not correct CUDA devices for the test.") + pytest.skip(f"Need exactly {pp_size}*{tp_size} CUDA gpus but got " + f"{cuda_device_count_stateless()}") with monkeypatch.context() as m: m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 579133ec0c3..c0940638598 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -20,15 +20,11 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None): ("facebook/opt-125m", {}), ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { "dtype": torch.float16, - "quantization": "compressed-tensors" }), ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", { "dtype": torch.float16, - "quantization": "compressed-tensors" - }), - ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", 
{ - "quantization": "compressed-tensors" }), + ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {}), ("meta-llama/Llama-3.2-1B-Instruct", {}), ] diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index 9f9b2d06b22..1e1364ce7bf 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -5,19 +5,19 @@ import vllm.envs as envs from vllm import LLM, SamplingParams +from vllm.compilation.activation_quant_fusion import ActivationQuantFusionPass from vllm.compilation.fix_functionalization import FixFunctionalizationPass from vllm.compilation.fusion import (FUSED_OPS, FusionPass, QuantKey, kFp8DynamicTokenSym, kFp8StaticTensorSym) from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func from vllm.compilation.noop_elimination import NoOpEliminationPass -from vllm.config import CompilationConfig +from vllm.config import CompilationConfig, VllmConfig from .backend import TestBackend OPS_IN_MODEL = [ torch.ops._C.rotary_embedding.default, torch.ops._C.fused_add_rms_norm.default, - torch.ops._C.silu_and_mul.default, ] RMS_OP = torch.ops._C.rms_norm.default @@ -29,6 +29,9 @@ ], } +SILU_MUL_OP = torch.ops._C.silu_and_mul.default + +SILU_MUL_QUANT_OP = torch.ops._C.silu_and_mul_quant.default prompts = [ "Hello, my name is", "The president of the United States is", @@ -49,13 +52,17 @@ def test_fix_functionalization(model: str, quant_key: QuantKey, do_fusion: bool): torch.set_default_device("cuda") - config = CompilationConfig.PassConfig(enable_fusion=do_fusion, - enable_noop=True) - noop_pass = NoOpEliminationPass(config) - fusion_pass = FusionPass.instance(config) - - passes = [noop_pass, fusion_pass] if do_fusion else [noop_pass] - func_pass = FixFunctionalizationPass(config) + vllm_config = VllmConfig() + vllm_config.compilation_config = CompilationConfig(pass_config= \ + CompilationConfig.PassConfig(enable_fusion=do_fusion, + enable_noop=True)) + noop_pass = NoOpEliminationPass(vllm_config) + fusion_pass = FusionPass.instance(vllm_config) + act_quant_fusion_pass = ActivationQuantFusionPass(vllm_config) + + passes = [noop_pass, fusion_pass, act_quant_fusion_pass + ] if do_fusion else [noop_pass] + func_pass = FixFunctionalizationPass(vllm_config) backend_func = TestBackend(*passes, func_pass) backend_no_func = TestBackend(*passes) @@ -77,6 +84,7 @@ def test_fix_functionalization(model: str, quant_key: QuantKey, model_runner.model = torch.compile(orig_model, fullgraph=True, backend=backend_no_func) + gen_no_func = llm.generate(prompts, sampling_params) for output_func, output_no_func in zip(gen_func, gen_no_func): @@ -86,7 +94,12 @@ def test_fix_functionalization(model: str, quant_key: QuantKey, # and replaced by fused quantized ops in RMS_QUANT_OPS. 
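+    # With fusion enabled, silu_and_mul should additionally be fused into
+    # silu_and_mul_quant for the static per-tensor fp8 case; otherwise the
+    # plain silu_and_mul op is expected to remain (see silu_mul_ops below).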
rms_ops = [FUSED_OPS[(quant_key, True)], FUSED_OPS[(quant_key, False)] ] if do_fusion else [RMS_OP] - ops = OPS_IN_MODEL + rms_ops + silu_mul_ops = [SILU_MUL_QUANT_OP] if do_fusion and \ + quant_key == kFp8StaticTensorSym else [ + SILU_MUL_OP + ] + + ops = OPS_IN_MODEL + rms_ops + silu_mul_ops for op in ops: find_auto_fn(backend_no_func.graph_post_pass.nodes, op) diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index efebf05b6b0..6a696fe0226 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -77,12 +77,13 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, vllm_config = VllmConfig(compilation_config=CompilationConfig( level=CompilationLevel.PIECEWISE, custom_ops=["+rms_norm"])) + vllm_config.compilation_config.pass_config = \ + CompilationConfig.PassConfig(enable_fusion=True, + enable_noop=True) with vllm.config.set_current_vllm_config(vllm_config): # Reshape pass is needed for the fusion pass to work - config = CompilationConfig.PassConfig(enable_fusion=True, - enable_noop=True) - noop_pass = NoOpEliminationPass(config) - fusion_pass = FusionPass.instance(config) + noop_pass = NoOpEliminationPass(vllm_config) + fusion_pass = FusionPass.instance(vllm_config) backend = TestBackend(noop_pass, fusion_pass) model = TestModel(hidden_size, eps, static, cutlass_fp8_enabled) diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py index 2c1ee4dc748..673ebe8b6fd 100644 --- a/tests/compile/test_pass_manager.py +++ b/tests/compile/test_pass_manager.py @@ -6,7 +6,7 @@ from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass from vllm.compilation.pass_manager import PostGradPassManager -from vllm.config import CompilationConfig +from vllm.config import VllmConfig # dummy custom pass that doesn't inherit @@ -16,7 +16,7 @@ def simple_callable(graph: torch.fx.Graph): # Should fail to add directly to the pass manager def test_bad_callable(): - config = CompilationConfig().pass_config + config = VllmConfig() pass_manager = PostGradPassManager() pass_manager.configure(config) @@ -43,7 +43,7 @@ def __call__(self, graph: torch.fx.graph.Graph) -> None: ], ) def test_pass_manager_uuid(callable): - config = CompilationConfig().pass_config + config = VllmConfig() pass_manager = PostGradPassManager() pass_manager.configure(config) @@ -64,7 +64,8 @@ def test_pass_manager_uuid(callable): # UUID should be different due to config change config2 = copy.deepcopy(config) - config2.enable_fusion = not config2.enable_fusion + config2.compilation_config.pass_config.enable_fusion = not \ + config2.compilation_config.pass_config.enable_fusion pass_manager3 = PostGradPassManager() pass_manager3.configure(config2) pass_manager3.add(callable) diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py new file mode 100644 index 00000000000..79f5486dadc --- /dev/null +++ b/tests/compile/test_sequence_parallelism.py @@ -0,0 +1,190 @@ +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import torch + +import vllm.envs as envs +from vllm.compilation.fix_functionalization import FixFunctionalizationPass +from vllm.compilation.fx_utils import (find_auto_fn, find_auto_fn_maybe, + find_specified_fn, + find_specified_fn_maybe, is_func) +from vllm.compilation.sequence_parallelism import SequenceParallelismPass +from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig, + VllmConfig) +from vllm.distributed import tensor_model_parallel_all_reduce +from 
vllm.distributed.parallel_state import (init_distributed_environment, + initialize_model_parallel) +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.platforms import current_platform +from vllm.utils import update_environment_variables + +from ..utils import multi_gpu_test +from .backend import TestBackend + +OPS_IN_MODEL_BEFORE = [ + torch.ops.vllm.all_reduce.default, +] + +OPS_IN_MODEL_AFTER = [ + torch.ops.vllm.reduce_scatter.default, + torch.ops.vllm.all_gather.default, +] + +OPS_IN_MODEL = [torch.ops._C.fused_add_rms_norm.default] + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + + +class TestModel(torch.nn.Module): + + def __init__(self, hidden_size=16, intermediate_size=32): + super().__init__() + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.gate_proj = torch.nn.Parameter( + torch.empty((intermediate_size, hidden_size))) + self.norm = RMSNorm(hidden_size, 1e-05) + # Initialize weights + torch.nn.init.normal_(self.gate_proj, std=0.02) + + def forward(self, hidden_states, residual): + """ + Forward pass implementing the operations in the FX graph + + Args: + hidden_states: Input tensor + residual: Residual tensor from previous layer + + Returns: + Tuple containing the output tensor + """ + # Reshape input + view = hidden_states.reshape(-1, self.hidden_size) + + #matrix multiplication + permute = self.gate_proj.permute(1, 0) + mm = torch.mm(view, permute) + + # Tensor parallel all-reduce + all_reduce = tensor_model_parallel_all_reduce(mm) + + # layer normalization + norm_output, residual_output = self.norm(all_reduce, residual) + + return norm_output, residual_output + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize("batch_size", [8]) +@pytest.mark.parametrize("seq_len", [16]) +@pytest.mark.parametrize("hidden_size", [16]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], + reason="Only test on CUDA") +def test_sequence_parallelism_pass(batch_size: int, seq_len: int, + hidden_size: int, dtype: torch.dtype): + num_processes = 2 + + def run_torch_spawn(fn, nprocs): + # need to use torch.mp.spawn otherwise will have problems with + # torch.distributed and cuda + torch.multiprocessing.spawn(fn, + args=(num_processes, batch_size, seq_len, + hidden_size, dtype), + nprocs=nprocs) + + run_torch_spawn(sequence_parallelism_pass_on_test_model, num_processes) + + +def sequence_parallelism_pass_on_test_model(local_rank: int, world_size: int, + batch_size: int, seq_len: int, + hidden_size: int, + dtype: torch.dtype): + current_platform.seed_everything(0) + + device = torch.device(f"cuda:{local_rank}") + torch.cuda.set_device(device) + torch.set_default_device(device) + torch.set_default_dtype(dtype) + + update_environment_variables({ + 'RANK': str(local_rank), + 'LOCAL_RANK': str(local_rank), + 'WORLD_SIZE': str(world_size), + 'MASTER_ADDR': 'localhost', + 'MASTER_PORT': '12345', + }) + + # initialize distributed + init_distributed_environment() + initialize_model_parallel(tensor_model_parallel_size=world_size) + + # configure vllm config for SequenceParallelismPass + vllm_config = VllmConfig() + vllm_config.compilation_config = CompilationConfig( + pass_config=CompilationConfig.PassConfig( + enable_sequence_parallelism=True, ), ) + vllm_config.device_config = DeviceConfig(device=torch.device("cuda")) + + # this is a fake model name to construct the model config + # 
in the vllm_config, it's not really used. + model = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e" + vllm_config.model_config = ModelConfig(model=model, + task="auto", + tokenizer=model, + tokenizer_mode="auto", + trust_remote_code=True, + dtype=dtype, + seed=42) + + sequence_parallelism_pass = SequenceParallelismPass(vllm_config) + backend_no_func = TestBackend(sequence_parallelism_pass) + func_pass = FixFunctionalizationPass(vllm_config) + backend_func = TestBackend(sequence_parallelism_pass, func_pass) + + model = TestModel(hidden_size, hidden_size * 2) + hidden_states = torch.randn((batch_size * seq_len, hidden_size), + dtype=dtype) + residual = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype) + + compiled_model_no_func = torch.compile(model, backend=backend_no_func) + compiled_model_no_func(hidden_states, residual) + compiled_model_func = torch.compile(model, backend=backend_func) + compiled_model_func(hidden_states, residual) + + # Check substitution worked + pre_nodes = backend_no_func.graph_pre_pass.nodes + post_nodes = backend_no_func.graph_post_pass.nodes + + # In pre-nodes, all reduce should be there, + # reduce scatter and all gather should not + for op in OPS_IN_MODEL_BEFORE: + find_specified_fn(pre_nodes, op) + for op in OPS_IN_MODEL_AFTER: + assert find_specified_fn_maybe(pre_nodes, op) is None + + # In post-nodes, reduce scatter and all gather should be there, + # all reduce should not + for op in OPS_IN_MODEL_AFTER: + find_specified_fn(post_nodes, op) + for op in OPS_IN_MODEL_BEFORE: + assert find_specified_fn_maybe(post_nodes, op) is None + + # check if the functionalization pass is applied + for op in OPS_IN_MODEL: + find_auto_fn(backend_no_func.graph_post_pass.nodes, op) + assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, + op) is None # noqa: E501 + + # make sure the ops were all de-functionalized + found = dict() + for node in backend_func.graph_post_pass.nodes: + for op in OPS_IN_MODEL: + if is_func(node, op): + found[op] = True + assert all(found[op] for op in OPS_IN_MODEL) diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py new file mode 100644 index 00000000000..313848372e0 --- /dev/null +++ b/tests/compile/test_silu_mul_quant_fusion.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch + +import vllm.envs as envs +from vllm._custom_ops import scaled_fp8_quant +from vllm.compilation.activation_quant_fusion import ActivationQuantFusionPass +from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe +from vllm.config import CompilationConfig, VllmConfig +from vllm.model_executor.layers.activation import SiluAndMul + +from .backend import TestBackend + + +class TestModel(torch.nn.Module): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.silu_and_mul = SiluAndMul() + self.scale = torch.rand(1, dtype=torch.float32) + + def forward(self, x): + y = self.silu_and_mul(x) + x2 = scaled_fp8_quant(y, self.scale) + return x2 + + +@pytest.mark.parametrize("num_tokens", [256]) +@pytest.mark.parametrize("hidden_size", [64]) +@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda", + reason="Only test on CUDA") +def test_fusion_silu_and_mul_quant(num_tokens, hidden_size): + torch.set_default_device("cuda") + torch.set_default_dtype(torch.float16) + + # Reshape pass is needed for the fusion pass to work + config = VllmConfig() + config.compilation_config = CompilationConfig( + 
pass_config=CompilationConfig.PassConfig(enable_fusion=True, + enable_reshape=True)) + fusion_pass = ActivationQuantFusionPass(config) + + backend = TestBackend(fusion_pass) + model = TestModel() + + # First dimension dynamic + x = torch.rand(num_tokens, hidden_size) + torch._dynamo.mark_dynamic(x, 0) + + result = model(x) + + model2 = torch.compile(model, backend=backend) + result2 = model2(x) + + # Check that it gives the same answer + torch.testing.assert_close(result[0].to(dtype=torch.float16), + result2[0].to(dtype=torch.float16), + atol=1e-3, + rtol=1e-3) + + # Check substitution worked + pre_nodes = backend.graph_pre_pass.nodes + post_nodes = backend.graph_post_pass.nodes + + silu_and_mul_quant = torch.ops._C.silu_and_mul_quant.default + fp8_quant = torch.ops._C.static_scaled_fp8_quant.default + + # In pre-nodes, fp8 quant should be present and fused kernels should not + assert find_auto_fn_maybe(pre_nodes, silu_and_mul_quant) is None + find_auto_fn(pre_nodes, fp8_quant) + + # In post-nodes, fused kernels should be present and fp8 quant should not + find_auto_fn(post_nodes, silu_and_mul_quant) + assert find_auto_fn_maybe(post_nodes, fp8_quant) is None diff --git a/tests/conftest.py b/tests/conftest.py index d272f448f61..fa979f1093b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,9 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 - import json import os import tempfile -from collections import UserList from enum import Enum from typing import Any, Callable, Optional, TypedDict, TypeVar, Union @@ -21,9 +19,10 @@ from tests.models.utils import (TokensTextLogprobs, TokensTextLogprobsPromptLogprobs) from vllm import LLM, SamplingParams +from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm.config import TaskOption, TokenizerPoolConfig, _get_and_verify_dtype +from vllm.config import TaskOption, _get_and_verify_dtype from vllm.connections import global_http_connection from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, @@ -57,16 +56,12 @@ def _read_prompts(filename: str) -> list[str]: return prompts -class _ImageAssetPrompts(TypedDict): +class ImageAssetPrompts(TypedDict): stop_sign: str cherry_blossom: str -class _ImageAssetsBase(UserList[ImageAsset]): - pass - - -class _ImageAssets(_ImageAssetsBase): +class ImageTestAssets(list[ImageAsset]): def __init__(self) -> None: super().__init__([ @@ -74,7 +69,7 @@ def __init__(self) -> None: ImageAsset("cherry_blossom"), ]) - def prompts(self, prompts: _ImageAssetPrompts) -> list[str]: + def prompts(self, prompts: ImageAssetPrompts) -> list[str]: """ Convenience method to define the prompt for each test image. 
@@ -84,29 +79,44 @@ def prompts(self, prompts: _ImageAssetPrompts) -> list[str]: return [prompts["stop_sign"], prompts["cherry_blossom"]] -class _VideoAssetPrompts(TypedDict): - sample_demo_1: str +class VideoAssetPrompts(TypedDict): + baby_reading: str + + +class VideoTestAssets(list[VideoAsset]): + + def __init__(self) -> None: + super().__init__([ + VideoAsset("baby_reading"), + ]) + + def prompts(self, prompts: VideoAssetPrompts) -> list[str]: + return [prompts["baby_reading"]] -class _VideoAssetsBase(UserList[VideoAsset]): - pass +class AudioAssetPrompts(TypedDict): + mary_had_lamb: str + winning_call: str -class _VideoAssets(_VideoAssetsBase): +class AudioTestAssets(list[AudioAsset]): def __init__(self) -> None: super().__init__([ - VideoAsset("sample_demo_1.mp4"), + AudioAsset("mary_had_lamb"), + AudioAsset("winning_call"), ]) - def prompts(self, prompts: _VideoAssetPrompts) -> list[str]: - return [prompts["sample_demo_1"]] + def prompts(self, prompts: AudioAssetPrompts) -> list[str]: + return [prompts["mary_had_lamb"], prompts["winning_call"]] -IMAGE_ASSETS = _ImageAssets() -"""Singleton instance of :class:`_ImageAssets`.""" -VIDEO_ASSETS = _VideoAssets() -"""Singleton instance of :class:`_VideoAssets`.""" +IMAGE_ASSETS = ImageTestAssets() +"""Singleton instance of {class}`ImageTestAssets`.""" +VIDEO_ASSETS = VideoTestAssets() +"""Singleton instance of {class}`VideoTestAssets`.""" +AUDIO_ASSETS = AudioTestAssets() +"""Singleton instance of {class}`AudioTestAssets`.""" @pytest.fixture(scope="function", autouse=True) @@ -254,15 +264,20 @@ def example_long_prompts() -> list[str]: @pytest.fixture(scope="session") -def image_assets() -> _ImageAssets: +def image_assets() -> ImageTestAssets: return IMAGE_ASSETS @pytest.fixture(scope="session") -def video_assets() -> _VideoAssets: +def video_assets() -> VideoTestAssets: return VIDEO_ASSETS +@pytest.fixture(scope="session") +def audio_assets() -> AudioTestAssets: + return AUDIO_ASSETS + + _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict) _R = TypeVar("_R") @@ -272,7 +287,8 @@ class HfRunner: def get_default_device(self): from vllm.platforms import current_platform - return ("cpu" if current_platform.is_cpu() else "cuda") + return ("cpu" + if current_platform.is_cpu() else current_platform.device_type) def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: if x is None or isinstance(x, (bool, )): @@ -390,10 +406,15 @@ def get_inputs( processor_kwargs["images"] = image if videos is not None and (video := videos[i]) is not None: processor_kwargs["videos"] = video - if audios is not None and (audio_tuple := audios[i]) is not None: - audio, sr = audio_tuple - processor_kwargs["audio"] = audio - processor_kwargs["sampling_rate"] = sr + if audios is not None and (audio_inputs := audios[i]) is not None: + # HACK - not all processors take sampling_rate; we should + # clean this up in the future. 
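+                # A 2-tuple is assumed to be (audio, sampling_rate); any other
+                # audio input is forwarded to the processor unchanged.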
+ if len(audio_inputs) == 2: + audio, sr = audio_inputs + processor_kwargs["audio"] = audio + processor_kwargs["sampling_rate"] = sr + else: + processor_kwargs["audio"] = audio_inputs inputs = self.processor(**processor_kwargs) if isinstance(inputs, BatchFeature): @@ -531,7 +552,10 @@ def _hidden_states_to_seq_logprobs( for _, hidden_state in enumerate(hidden_states): last_hidden_states = hidden_state[-1][0] logits = torch.matmul( - last_hidden_states.to(output_embeddings.weight.device), + last_hidden_states.to( + device=output_embeddings.weight.device, + dtype=output_embeddings.weight.dtype, + ), output_embeddings.weight.t(), ) if getattr(output_embeddings, "bias", None) is not None: @@ -700,7 +724,7 @@ def hf_runner(): class VllmRunner: """ The default value of some arguments have been modified from - :class:`~vllm.LLM` as follows: + {class}`~vllm.LLM` as follows: - `trust_remote_code`: Set to `True` instead of `False` for convenience. - `seed`: Set to `0` instead of `None` for test reproducibility. @@ -708,7 +732,7 @@ class VllmRunner: - `block_size`: Set to `16` instead of `None` to reduce memory usage. - `enable_chunked_prefill`: Set to `False` instead of `None` for test reproducibility. - - `enforce_eager`: Set to `False` instead of `None` to test CUDA graph. + - `enforce_eager`: Set to `False` to test CUDA graph. """ def __init__( @@ -749,7 +773,7 @@ def __init__( def get_inputs( self, - prompts: list[str], + prompts: Union[list[str], list[torch.Tensor]], images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, @@ -771,16 +795,18 @@ def get_inputs( if audios is not None and (audio := audios[i]) is not None: multi_modal_data["audio"] = audio - inputs.append( - TextPrompt(prompt=prompt, - multi_modal_data=multi_modal_data - if multi_modal_data else None)) + text_prompt_kwargs = { + ("prompt" if isinstance(prompt, str) else "prompt_embeds"): + prompt, + "multi_modal_data": multi_modal_data or None + } + inputs.append(TextPrompt(**text_prompt_kwargs)) return inputs def generate( self, - prompts: list[str], + prompts: Union[list[str], list[torch.Tensor]], sampling_params: SamplingParams, images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, @@ -806,7 +832,7 @@ def generate( output_str = sample.text output_ids = list(sample.token_ids) req_sample_output_ids.append(prompt_ids + output_ids) - req_sample_output_strs.append(prompt_str + output_str) + req_sample_output_strs.append((prompt_str or "") + output_str) outputs.append((req_sample_output_ids, req_sample_output_strs)) return outputs @@ -873,7 +899,7 @@ def generate_encoder_decoder_w_logprobs( def generate_greedy( self, - prompts: list[str], + prompts: Union[list[str], list[torch.Tensor]], max_tokens: int, images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, @@ -925,6 +951,7 @@ def generate_encoder_decoder_greedy_logprobs( max_tokens: int, num_logprobs: int, num_prompt_logprobs: Optional[int] = None, + skip_special_tokens: bool = True, ) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]: greedy_logprobs_params = SamplingParams( @@ -932,6 +959,7 @@ def generate_encoder_decoder_greedy_logprobs( max_tokens=max_tokens, logprobs=num_logprobs, prompt_logprobs=(num_prompt_logprobs), + skip_special_tokens=skip_special_tokens, ) ''' Greedy logprobs generation for vLLM encoder/decoder models @@ -1008,20 +1036,6 @@ def vllm_runner(): return VllmRunner -def 
get_tokenizer_pool_config(tokenizer_group_type): - if tokenizer_group_type is None: - return None - if tokenizer_group_type == "ray": - return TokenizerPoolConfig(pool_size=1, - pool_type="ray", - extra_config={}) - if isinstance(tokenizer_group_type, type): - return TokenizerPoolConfig(pool_size=1, - pool_type=tokenizer_group_type, - extra_config={}) - raise ValueError(f"Unknown tokenizer_group_type: {tokenizer_group_type}") - - @pytest.fixture() def temporary_enable_log_propagate(): import logging diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index e9b537ed515..9e8e315d87b 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -195,15 +195,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, ]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{ - "block_size": 8, + "block_size": 16, "max_num_batched_tokens": 2, "max_num_seqs": 2, }, { - "block_size": 8, + "block_size": 16, "max_num_batched_tokens": 3, "max_num_seqs": 2, }, { - "block_size": 8, + "block_size": 16, "max_num_batched_tokens": 256, "max_num_seqs": 10, }]) diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 8bd64923fe2..a5ba16898d8 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -2,16 +2,18 @@ import time from collections import deque +from typing import Optional from unittest.mock import MagicMock import pytest # noqa +import torch from torch import Use # noqa from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.core.interfaces import AllocStatus from vllm.core.scheduler import Scheduler, SchedulingBudget from vllm.lora.request import LoRARequest -from vllm.sequence import SequenceGroup +from vllm.sequence import SequenceGroup, SequenceStatus from .utils import (append_new_token, append_new_token_seq, append_new_token_seq_group, create_dummy_prompt, @@ -968,3 +970,73 @@ def test_no_multiple_partial_prefills_with_chunked_prefill_and_prefix_caching( ), "A partial prefix of C (4 tokens) should be prefilled, with the " "remaining tokens fit into 3 token budget (4-1 from the seqA). It will " "then be rounded down to 2 tokens on block size, thus 6 tokens in total." + + +def test_no_batches_mixed_with_prompt_tokens_and_prompt_embeds(): + """ + Test that the scheduler does not schedule batches with prompt tokens and + prompt embeddings co-mingled. 
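+    Odd-indexed requests are fed as prompt token ids and even-indexed requests
+    as prompt embeddings; every batch produced by the scheduler must contain
+    only one of the two input kinds.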
+    """
+    block_size = 2
+    max_seq_group = 3
+    scheduler = initialize_scheduler(
+        block_size=block_size,
+        num_cpu_blocks=16,
+        num_gpu_blocks=16,
+        max_num_seqs=max_seq_group,
+        max_model_len=100,
+        enable_prefix_caching=True,
+    )
+
+    # the even indexed inputs should be passed in via embeddings,
+    # odds via token_ids
+    seq_length = 7
+    embedding_size = 5
+    num_seqs = 11
+    seq_tokens: list[list[int]] = []
+    seq_embeds: list[Optional[torch.Tensor]] = []
+    for i in range(num_seqs):
+        if i % 2:
+            seq_tokens.append(list(range(seq_length)))
+            seq_embeds.append(None)
+        else:
+            seq_tokens.append([0] * seq_length)
+            seq_embeds.append(torch.rand(embedding_size))
+
+    seq_and_seq_groups = [
+        create_dummy_prompt(f"{i}",
+                            prompt_tokens=seq_tokens[i],
+                            prompt_embeds=seq_embeds[i],
+                            block_size=block_size)
+        for i in range(len(seq_tokens))
+    ]
+
+    for _, seq_group in seq_and_seq_groups:
+        scheduler.add_seq_group(seq_group)
+
+    while not all(seq.is_finished() for seq, _ in seq_and_seq_groups):
+        unfinished_seq_groups = [
+            seq_group for _, seq_group in seq_and_seq_groups
+            if not seq_group.is_finished()
+        ]
+        _, out = schedule_and_update_computed_tokens(scheduler)
+        assert len(out.scheduled_seq_groups) > 0
+        batch_is_prompt_embeds = out.scheduled_seq_groups[
+            0].seq_group.uses_prompt_embeds()
+        expected_scheduled_seq_groups = [
+            seq_group for seq_group in unfinished_seq_groups
+            if seq_group.uses_prompt_embeds() == batch_is_prompt_embeds
+        ]
+
+        # We should have as many scheduled groups as possible, without mixing
+        assert len(out.scheduled_seq_groups) == min(
+            max_seq_group, len(expected_scheduled_seq_groups))
+        assert all(scheduled_seq_group.seq_group.uses_prompt_embeds() ==
+                   batch_is_prompt_embeds
+                   for scheduled_seq_group in out.scheduled_seq_groups)
+
+        # Finish the scheduled groups
+        for scheduled_seq_group in out.scheduled_seq_groups:
+            for seq in scheduled_seq_group.seq_group.seqs:
+                seq.status = SequenceStatus.FINISHED_STOPPED
+        scheduler.free_finished_seq_groups()
diff --git a/tests/core/utils.py b/tests/core/utils.py
index ea18b879a31..84b0426b470 100644
--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@@ -5,9 +5,11 @@
 from collections.abc import Sequence as GenericSequence
 from typing import Any, Optional
 
+import torch
+
 from vllm import SamplingParams
 from vllm.core.scheduler import Scheduler, SchedulerOutputs
-from vllm.inputs import EncoderDecoderInputs, token_inputs
+from vllm.inputs import EncoderDecoderInputs, embeds_inputs, token_inputs
 from vllm.lora.request import LoRARequest
 from vllm.sequence import (Logprob, Sequence, SequenceGroup,
                            SequenceGroupMetadata)
@@ -19,6 +21,7 @@ def create_dummy_prompt(
     block_size: Optional[int] = None,
     lora_request: Optional[LoRARequest] = None,
     prompt_tokens: Optional[list[int]] = None,
+    prompt_embeds: Optional[torch.Tensor] = None,
     min_tokens: int = 0,
     max_tokens: int = 16,
 ) -> tuple[Sequence, SequenceGroup]:
@@ -31,9 +34,13 @@ def create_dummy_prompt(
         prompt_tokens = list(range(prompt_length))
 
     prompt_str = " ".join([str(t) for t in prompt_tokens])
+    inputs = token_inputs(
+        prompt_token_ids=prompt_tokens,
+        prompt=prompt_str) if prompt_embeds is None else embeds_inputs(
+            prompt_embeds=prompt_embeds)
     prompt = Sequence(
         int(request_id),
-        inputs=token_inputs(prompt_tokens, prompt=prompt_str),
+        inputs=inputs,
         block_size=block_size,
     )
     seq_group = SequenceGroup(
diff --git a/tests/distributed/conftest.py b/tests/distributed/conftest.py
new file mode 100644
index 00000000000..ee8f2097933
--- /dev/null
+++ b/tests/distributed/conftest.py
@@
-0,0 +1,145 @@ +# SPDX-License-Identifier: Apache-2.0 +import random +from typing import Optional, Union + +import msgspec +import msgspec.msgpack +import pytest +import zmq + +from vllm.config import KVEventsConfig +from vllm.distributed.kv_events import EventPublisherFactory + +from .test_events import SampleBatch + + +@pytest.fixture +def random_port(): + """Generate a random port number for testing""" + return random.randint(10000, 60000) + + +@pytest.fixture +def publisher_config(random_port, request): + """Create a publisher config with inproc transport""" + how = request.param if hasattr(request, "param") else "inproc" + + if how == "inproc": + endpoint = f"inproc://test-{random_port}" + replay_endpoint = endpoint + "-replay" + else: + endpoint = f"tcp://*:{random_port}" + replay_endpoint = f"tcp://*:{random_port + 1}" + + return KVEventsConfig(enable_kv_cache_events=True, + publisher="zmq", + endpoint=endpoint, + replay_endpoint=replay_endpoint, + buffer_steps=100, + hwm=1000, + topic="test") + + +@pytest.fixture +def publisher(publisher_config): + """Create and return a publisher instance""" + pub = EventPublisherFactory.create(publisher_config) + yield pub + pub.shutdown() + + +@pytest.fixture +def subscriber(publisher_config): + """Create and return a subscriber for testing""" + endpoint = publisher_config.endpoint + replay_endpoint = publisher_config.replay_endpoint + + if endpoint.startswith("tcp://*"): + endpoint = endpoint.replace("*", "127.0.0.1") + if replay_endpoint and replay_endpoint.startswith("tcp://*"): + replay_endpoint = replay_endpoint.replace("*", "127.0.0.1") + + sub = MockSubscriber(endpoint, replay_endpoint, publisher_config.topic) + yield sub + sub.close() + + +class MockSubscriber: + """Helper class to receive and verify published events""" + + def __init__(self, + pub_endpoint: str, + replay_endpoint: Optional[str] = None, + topic: str = "", + decode_type=SampleBatch): + self.ctx = zmq.Context.instance() + + # Set up subscriber socket + self.sub = self.ctx.socket(zmq.SUB) + self.sub.setsockopt(zmq.SUBSCRIBE, topic.encode('utf-8')) + self.sub.connect(pub_endpoint) + + # Set up replay socket if provided + self.replay = None + if replay_endpoint: + self.replay = self.ctx.socket(zmq.REQ) + self.replay.connect(replay_endpoint) + + self.topic = topic + self.topic_bytes = topic.encode('utf-8') + self.received_msgs: list[tuple[int, SampleBatch]] = [] + self.last_seq = -1 + self.decoder = msgspec.msgpack.Decoder(type=decode_type) + + def receive_one(self, + timeout=1000) -> Union[tuple[int, SampleBatch], None]: + """Receive a single message with timeout""" + if not self.sub.poll(timeout): + return None + + topic_bytes, seq_bytes, payload = self.sub.recv_multipart() + assert topic_bytes == self.topic_bytes + + seq = int.from_bytes(seq_bytes, "big") + data = self.decoder.decode(payload) + self.last_seq = seq + self.received_msgs.append((seq, data)) + return seq, data + + def request_replay(self, start_seq: int) -> None: + """Request replay of messages starting from start_seq""" + if not self.replay: + raise ValueError("Replay socket not initialized") + + self.replay.send(start_seq.to_bytes(8, "big")) + + def receive_replay(self) -> list[tuple[int, SampleBatch]]: + """Receive replayed messages""" + if not self.replay: + raise ValueError("Replay socket not initialized") + + replayed: list[tuple[int, SampleBatch]] = [] + while True: + try: + if not self.replay.poll(1000): + break + + frames = self.replay.recv_multipart() + if not frames or not frames[-1]: + # End of 
replay marker + break + + seq_bytes, payload = frames + seq = int.from_bytes(seq_bytes, "big") + data = self.decoder.decode(payload) + replayed.append((seq, data)) + except zmq.ZMQError as _: + break + + return replayed + + def close(self): + """Clean up resources""" + self.sub.close() + if self.replay: + self.replay.close() diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index ac6d6aae300..8f4c3537e15 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -14,7 +14,8 @@ from vllm.distributed import (broadcast_tensor_dict, get_pp_group, tensor_model_parallel_all_gather, - tensor_model_parallel_all_reduce) + tensor_model_parallel_all_reduce, + tensor_model_parallel_reduce_scatter) from ..utils import init_test_distributed_environment, multi_process_parallel @@ -47,6 +48,34 @@ def all_reduce_test_worker( torch.testing.assert_close(t, expected) +@ray.remote(num_gpus=1, max_calls=1) +def reduce_scatter_test_worker(monkeypatch: pytest.MonkeyPatch, tp_size: int, + pp_size: int, rank: int, + distributed_init_port: str): + # it is important to delete the CUDA_VISIBLE_DEVICES environment variable + # so that each worker can see all the GPUs + # they will be able to set the device to the correct GPU + monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + init_test_distributed_environment(tp_size, pp_size, rank, + distributed_init_port) + + num_elements = 8 + all_tensors = [ + torch.arange(num_elements, dtype=torch.float32, device="cuda") * + (r + 1) for r in range(tp_size) + ] + + index = rank % tp_size + partition_size = num_elements // tp_size + all_reduce = torch.sum(torch.stack(all_tensors, dim=0), dim=0) + expected = all_reduce[index * partition_size:(index + 1) * partition_size] + t = all_tensors[index] + t = tensor_model_parallel_reduce_scatter(t, 0) + torch.testing.assert_close(t, expected) + + @ray.remote(num_gpus=1, max_calls=1) def all_gather_test_worker( monkeypatch: pytest.MonkeyPatch, diff --git a/tests/distributed/test_events.py b/tests/distributed/test_events.py new file mode 100644 index 00000000000..15bcfdb8555 --- /dev/null +++ b/tests/distributed/test_events.py @@ -0,0 +1,193 @@ +# SPDX-License-Identifier: Apache-2.0 +import threading +import time + +import msgspec +import pytest + +from vllm.distributed.kv_events import (EventBatch, EventPublisherFactory, + NullEventPublisher) + + +class EventSample( + msgspec.Struct, + tag=True, # type: ignore + array_like=True # type: ignore +): + """Test event for publisher testing""" + id: int + value: str + + +class SampleBatch(EventBatch): + """Test event batch for publisher testing""" + events: list[EventSample] + + +def create_test_events(count: int) -> SampleBatch: + """Create a batch of test events""" + events = [EventSample(id=i, value=f"test-{i}") for i in range(count)] + return SampleBatch(ts=time.time(), events=events) + + +def test_basic_publishing(publisher, subscriber): + """Test basic event publishing works""" + + test_batch = create_test_events(5) + publisher.publish(test_batch) + + result = subscriber.receive_one(timeout=1000) + assert result is not None, "No message received" + + seq, received = result + assert seq == 0, "Sequence number mismatch" + assert received.ts == pytest.approx(test_batch.ts, + abs=0.1), ("Timestamp mismatch") + assert len(received.events) == len( + test_batch.events), ("Number of events mismatch") + + for i, event in enumerate(received.events): + 
assert event.id == i, "Event id mismatch" + assert event.value == f"test-{i}", "Event value mismatch" + + +def test_multiple_events(publisher, subscriber): + """Test publishing and receiving multiple event batches""" + for _ in range(10): + batch = create_test_events(2) + publisher.publish(batch) + + received = [] + for _ in range(10): + data = subscriber.receive_one(timeout=100) + if data: + received.append(data) + + assert len(received) == 10, "Number of messages mismatch" + seqs = [seq for seq, _ in received] + assert seqs == list(range(10)), "Sequence numbers mismatch" + + +def test_replay_mechanism(publisher, subscriber): + """Test the replay mechanism works correctly""" + for _ in range(19): + batch = create_test_events(1) + publisher.publish(batch) + + time.sleep(0.5) # Need publisher to process above requests + subscriber.request_replay(10) + + batch = create_test_events(1) + publisher.publish(batch) # 20th message + + replayed = subscriber.receive_replay() + + assert len(replayed) > 0, "No replayed messages received" + seqs = [seq for seq, _ in replayed] + assert all(seq >= 10 for seq in seqs), "Replayed messages not in order" + assert seqs == list(range(min(seqs), + max(seqs) + + 1)), ("Replayed messages not consecutive") + + +def test_buffer_limit(publisher, subscriber, publisher_config): + """Test buffer limit behavior""" + buffer_size = publisher_config.buffer_steps + + # Publish more events than the buffer can hold + for i in range(buffer_size + 10): + batch = create_test_events(1) + publisher.publish(batch) + + time.sleep(0.5) # Need publisher to process above requests + subscriber.request_replay(0) + + batch = create_test_events(1) + publisher.publish(batch) + + replayed = subscriber.receive_replay() + + assert len(replayed) <= buffer_size, "Can't replay more than buffer size" + + oldest_seq = min(seq for seq, _ in replayed) + assert oldest_seq >= 10, "The oldest sequence should be at least 10" + + +def test_topic_filtering(publisher_config): + """ + Test that a subscriber only receives messages matching its topic filter + """ + publisher_config.replay_endpoint = None + + cfg = publisher_config.model_copy() + cfg.topic = "foo" + pub = EventPublisherFactory.create(cfg) + + from .conftest import MockSubscriber + sub_foo = MockSubscriber(cfg.endpoint, None, "foo") + sub_bar = MockSubscriber(cfg.endpoint, None, "bar") + + try: + time.sleep(0.1) + + for _ in range(3): + pub.publish(create_test_events(1)) + + foo_received = [sub_foo.receive_one(timeout=200) for _ in range(3)] + assert all(msg is not None for msg in foo_received), ( + "Subscriber with matching topic should receive messages") + + bar_received = [sub_bar.receive_one(timeout=200) for _ in range(3)] + assert all(msg is None for msg in bar_received), ( + "Subscriber with non-matching topic should receive no messages") + finally: + pub.shutdown() + sub_foo.close() + sub_bar.close() + + +def test_high_volume(publisher, subscriber): + """Test publishing and receiving a high volume of events""" + num_batches = 10_000 + events_per_batch = 100 + + # Publish events in a separate thread to not block + def publish_events(): + for i in range(num_batches): + batch = create_test_events(events_per_batch) + publisher.publish(batch) + # Small delay to avoid overwhelming + if i % 100 == 0: + time.sleep(0.01) + + received: list[tuple[int, SampleBatch]] = [] + + publisher_thread = threading.Thread(target=publish_events) + publisher_thread.start() + + start_time = time.time() + while len(received) < num_batches: + if time.time() - 
start_time > 10: # Timeout after 10 seconds + break + + result = subscriber.receive_one(timeout=100) + if result: + received.append(result) + + publisher_thread.join() + + assert len(received) >= num_batches * 0.9, ( + "We should have received most messages") + + seqs = [seq for seq, _ in received] + assert sorted(seqs) == seqs, "Sequence numbers should be in order" + + +def test_null_publisher(): + """Test that NullEventPublisher can be used without errors""" + publisher = NullEventPublisher() + + # This should not raise any errors + batch = create_test_events(5) + publisher.publish(batch) + publisher.shutdown() diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 05e30f855ce..03de8d9b92b 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -161,12 +161,12 @@ def iter_params(self, model_id: str): "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(), "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(), "tiiuae/falcon-7b": PPTestSettings.fast(), - "google/gemma-2b": PPTestSettings.fast(), + "google/gemma-1.1-2b-it": PPTestSettings.fast(), "google/gemma-2-9b": PPTestSettings.fast(), "gpt2": PPTestSettings.fast(), "bigcode/starcoder": PPTestSettings.fast(), "EleutherAI/gpt-j-6b": PPTestSettings.fast(), - "EleutherAI/pythia-12b": PPTestSettings.fast(), + "EleutherAI/pythia-1.4b": PPTestSettings.fast(), "ibm/PowerLM-3b": PPTestSettings.fast(), "ibm/PowerMoE-3b": PPTestSettings.fast(), # Uses Llama @@ -195,7 +195,7 @@ def iter_params(self, model_id: str): "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(), "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(multi_node_only=True, load_format="dummy"), # noqa: E501 "Qwen/Qwen-7B-Chat": PPTestSettings.fast(), - "Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(), + "Qwen/Qwen2.5-0.5B-Instruct": PPTestSettings.fast(), "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(), "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(), "bigcode/starcoder2-3b": PPTestSettings.fast(), diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py new file mode 100644 index 00000000000..19497ad9c14 --- /dev/null +++ b/tests/distributed/test_sequence_parallel.py @@ -0,0 +1,296 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +WARNING: This test runs in both single-node (4 GPUs) and multi-node + (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is + important to set the distributed backend to "mp" to avoid Ray scheduling + all workers in a node other than the head node, which can cause the test + to fail. 
+""" +import json +import os +from dataclasses import dataclass +from typing import Literal, NamedTuple, Optional + +import pytest + +from vllm.config import TaskOption +from vllm.logger import init_logger + +from ..models.registry import HF_EXAMPLE_MODELS +from ..utils import compare_two_settings, create_new_process_for_each_test + +logger = init_logger("test_sequence_parallel") + +VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" + + +class ParallelSetup(NamedTuple): + tp_size: int + sp_enabled: bool + eager_mode: bool + chunked_prefill: bool + + +class SPTestOptions(NamedTuple): + multi_node_only: bool + load_format: Optional[str] = None + + +@dataclass +class SPTestSettings: + parallel_setups: list[ParallelSetup] + # NOTE: the length of distributed_backends and + # vllm_major_versions should be the same, and they + # are first zipped together to iterate over all + # test settings. + distributed_backends: list[str] + # vllm major version: "0" for V0, "1" for V1 + vllm_major_versions: list[str] + task: TaskOption + test_options: SPTestOptions + + def __post_init__(self): + if len(self.distributed_backends) != len(self.vllm_major_versions): + raise ValueError( + f"Length mismatch: distributed_backends " + f"({len(self.distributed_backends)}) != " + f"vllm_major_versions ({len(self.vllm_major_versions)})") + + @staticmethod + def detailed( + *, + tp_base: int = 2, + multi_node_only: bool = False, + task: TaskOption = "auto", + load_format: Optional[str] = None, + ): + return SPTestSettings( + parallel_setups=[ + ParallelSetup(tp_size=tp_base, + sp_enabled=True, + eager_mode=False, + chunked_prefill=False), + ParallelSetup(tp_size=tp_base, + sp_enabled=True, + eager_mode=False, + chunked_prefill=True), + ParallelSetup(tp_size=tp_base, + sp_enabled=True, + eager_mode=True, + chunked_prefill=False), + ParallelSetup(tp_size=tp_base, + sp_enabled=True, + eager_mode=True, + chunked_prefill=True) + ], + distributed_backends=["mp", "ray"], + vllm_major_versions=["1", "1"], + task=task, + test_options=SPTestOptions(multi_node_only=multi_node_only, + load_format=load_format), + ) + + @staticmethod + def fast( + *, + tp_base: int = 2, + task: TaskOption = "auto", + multi_node_only: bool = False, + load_format: Optional[str] = None, + ): + return SPTestSettings( + parallel_setups=[ + ParallelSetup(tp_size=tp_base, + sp_enabled=True, + eager_mode=False, + chunked_prefill=False), + ], + distributed_backends=["mp", "ray"], + vllm_major_versions=["1", "1"], + task=task, + test_options=SPTestOptions(multi_node_only=multi_node_only, + load_format=load_format), + ) + + def iter_params(self, model_id: str): + opts = self.test_options + + for parallel_setup in self.parallel_setups: + for backend, vllm_major_version in zip(self.distributed_backends, + self.vllm_major_versions): + yield (model_id, parallel_setup, backend, vllm_major_version, + self.task, opts) + + +def _compare_sp( + model_id: str, + parallel_setup: ParallelSetup, + distributed_backend: str, + vllm_major_version: str, + task: TaskOption, + test_options: SPTestOptions, + num_gpus_available: int, + *, + method: Literal["generate", "encode"], + is_multimodal: bool, +): + ( + tp_size, + sp_enabled, + eager_mode, + chunked_prefill, + ) = parallel_setup + + multi_node_only, load_format = test_options + + model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) + model_info.check_transformers_version(on_fail="skip") + + trust_remote_code = model_info.trust_remote_code + tokenizer_mode = model_info.tokenizer_mode + hf_overrides = 
model_info.hf_overrides + + if load_format == "dummy": + # Avoid OOM + text_overrides = { + "num_hidden_layers": 4, + "hidden_size": 512, + "intermediate_size": 800, + "num_attention_heads": 4, + "num_key_value_heads": 1, + } + + if is_multimodal: + hf_overrides.update({"text_config": text_overrides}) + else: + hf_overrides.update(text_overrides) + else: + model_info.check_available_online(on_fail="skip") + + pp_size = 1 + if num_gpus_available < tp_size * pp_size: + pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs") + if VLLM_MULTI_NODE and distributed_backend == "mp": + pytest.skip("Skipping multi-node pipeline parallel test for " + "multiprocessing distributed backend") + if multi_node_only and not VLLM_MULTI_NODE: + pytest.skip("Not in multi-node setting") + + common_args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "float16", + "--max-model-len", + "2048", + "--max-num-seqs", + "8", + ] + if chunked_prefill: + common_args.append("--enable-chunked-prefill") + if eager_mode: + common_args.append("--enforce-eager") + if task != "auto": + common_args.extend(["--task", task]) + if trust_remote_code: + common_args.append("--trust-remote-code") + if tokenizer_mode: + common_args.extend(["--tokenizer-mode", tokenizer_mode]) + if load_format: + common_args.extend(["--load-format", load_format]) + if hf_overrides: + common_args.extend(["--hf-overrides", json.dumps(hf_overrides)]) + + compilation_config = { + 'level': 3, + 'custom_ops': ["+rms_norm"], + 'compile_sizes': [4, 8], + 'splitting_ops': [], + 'pass_config': { + 'enable_sequence_parallism': sp_enabled, + 'enable_noop': True, + 'enable_fusion': True, + }, + } + + tp_sp_env = tp_env = { + "VLLM_USE_V1": vllm_major_version, + } + + tp_sp_args = [ + *common_args, + "--tensor-parallel-size", + str(tp_size), + "--distributed-executor-backend", + distributed_backend, + "--compilation_config", + str(compilation_config), + ] + + tp_env = { + "VLLM_USE_V1": vllm_major_version, + } + tp_args = [ + *common_args, + "--tensor-parallel-size", + str(tp_size), + "--distributed-executor-backend", + "mp", + ] + + try: + compare_two_settings(model_id, + tp_sp_args, + tp_args, + tp_sp_env, + tp_env, + method=method) + except Exception: + testing_ray_compiled_graph = tp_sp_env is not None + if testing_ray_compiled_graph and vllm_major_version == "0": + # Ray Compiled Graph tests are flaky for V0, + # so we don't want to fail the test + logger.exception("Ray Compiled Graph tests failed") + else: + raise + + +SP_TEXT_GENERATION_MODELS = { + # [Decoder-only] + "meta-llama/Llama-3.2-1B-Instruct": SPTestSettings.detailed(), +} + +SP_TEST_MODELS = [ + # TODO support other models + # [LANGUAGE GENERATION] + "meta-llama/Llama-3.2-1B-Instruct", +] + + +@pytest.mark.parametrize( + ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", + "task", "test_options"), + [ + params for model_id, settings in SP_TEXT_GENERATION_MODELS.items() + for params in settings.iter_params(model_id) + if model_id in SP_TEST_MODELS + ], +) +@create_new_process_for_each_test() +def test_tp_sp_generation( + model_id: str, + parallel_setup: ParallelSetup, + distributed_backend: str, + vllm_major_version: str, + task: TaskOption, + test_options: SPTestOptions, + num_gpus_available, +): + _compare_sp(model_id, + parallel_setup, + distributed_backend, + vllm_major_version, + task, + test_options, + num_gpus_available, + method="generate", + is_multimodal=False) diff --git a/tests/engine/test_arg_utils.py 
b/tests/engine/test_arg_utils.py index 92387b46425..65471cb3af3 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -1,16 +1,151 @@ # SPDX-License-Identifier: Apache-2.0 +import json from argparse import ArgumentError, ArgumentTypeError +from contextlib import nullcontext +from dataclasses import dataclass, field +from typing import Literal, Optional import pytest -from vllm.config import PoolerConfig -from vllm.engine.arg_utils import EngineArgs, nullable_kvs +from vllm.config import config +from vllm.engine.arg_utils import (EngineArgs, contains_type, get_kwargs, + get_type, is_not_builtin, is_type, + literal_to_kwargs, nullable_kvs, + optional_type) from vllm.utils import FlexibleArgumentParser +@pytest.mark.parametrize(("type", "value", "expected"), [ + (int, "42", 42), + (int, "None", None), + (float, "3.14", 3.14), + (float, "None", None), + (str, "Hello World!", "Hello World!"), + (str, "None", None), + (json.loads, '{"foo":1,"bar":2}', { + "foo": 1, + "bar": 2 + }), + (json.loads, "foo=1,bar=2", { + "foo": 1, + "bar": 2 + }), + (json.loads, "None", None), +]) +def test_optional_type(type, value, expected): + optional_type_func = optional_type(type) + context = nullcontext() + if value == "foo=1,bar=2": + context = pytest.warns(DeprecationWarning) + with context: + assert optional_type_func(value) == expected + + +@pytest.mark.parametrize(("type_hint", "type", "expected"), [ + (int, int, True), + (int, float, False), + (list[int], list, True), + (list[int], tuple, False), + (Literal[0, 1], Literal, True), +]) +def test_is_type(type_hint, type, expected): + assert is_type(type_hint, type) == expected + + +@pytest.mark.parametrize(("type_hints", "type", "expected"), [ + ({float, int}, int, True), + ({int, tuple[int]}, int, True), + ({int, tuple[int]}, float, False), + ({str, Literal["x", "y"]}, Literal, True), +]) +def test_contains_type(type_hints, type, expected): + assert contains_type(type_hints, type) == expected + + +@pytest.mark.parametrize(("type_hints", "type", "expected"), [ + ({int, float}, int, int), + ({int, float}, str, None), + ({str, Literal["x", "y"]}, Literal, Literal["x", "y"]), +]) +def test_get_type(type_hints, type, expected): + assert get_type(type_hints, type) == expected + + +@pytest.mark.parametrize(("type_hints", "expected"), [ + ({Literal[1, 2]}, { + "type": int, + "choices": [1, 2] + }), + ({Literal[1, "a"]}, Exception), +]) +def test_literal_to_kwargs(type_hints, expected): + context = nullcontext() + if expected is Exception: + context = pytest.raises(expected) + with context: + assert literal_to_kwargs(type_hints) == expected + + +@config +@dataclass +class DummyConfigClass: + regular_bool: bool = True + """Regular bool with default True""" + optional_bool: Optional[bool] = None + """Optional bool with default None""" + optional_literal: Optional[Literal["x", "y"]] = None + """Optional literal with default None""" + tuple_n: tuple[int, ...] 
= field(default_factory=lambda: (1, 2, 3)) + """Tuple with variable length""" + tuple_2: tuple[int, int] = field(default_factory=lambda: (1, 2)) + """Tuple with fixed length""" + list_n: list[int] = field(default_factory=lambda: [1, 2, 3]) + """List with variable length""" + list_literal: list[Literal[1, 2]] = field(default_factory=list) + """List with literal choices""" + literal_literal: Literal[Literal[1], Literal[2]] = 1 + """Literal of literals with default 1""" + json_tip: dict = field(default_factory=dict) + """Dict which will be JSON in CLI""" + + +@pytest.mark.parametrize(("type_hint", "expected"), [ + (int, False), + (DummyConfigClass, True), +]) +def test_is_not_builtin(type_hint, expected): + assert is_not_builtin(type_hint) == expected + + +def test_get_kwargs(): + kwargs = get_kwargs(DummyConfigClass) + print(kwargs) + + # bools should not have their type set + assert kwargs["regular_bool"].get("type") is None + assert kwargs["optional_bool"].get("type") is None + # optional literals should have None as a choice + assert kwargs["optional_literal"]["choices"] == ["x", "y", "None"] + # tuples should have the correct nargs + assert kwargs["tuple_n"]["nargs"] == "+" + assert kwargs["tuple_2"]["nargs"] == 2 + # lists should work + assert kwargs["list_n"]["type"] is int + assert kwargs["list_n"]["nargs"] == "+" + # lists with literals should have the correct choices + assert kwargs["list_literal"]["type"] is int + assert kwargs["list_literal"]["nargs"] == "+" + assert kwargs["list_literal"]["choices"] == [1, 2] + # literals of literals should have merged choices + assert kwargs["literal_literal"]["choices"] == [1, 2] + # dict should have json tip in help + json_tip = "\n\nShould be a valid JSON string." + assert kwargs["json_tip"]["help"].endswith(json_tip) + + @pytest.mark.parametrize(("arg", "expected"), [ - (None, None), + (None, dict()), ("image=16", { "image": 16 }), @@ -24,6 +159,10 @@ }), ]) def test_limit_mm_per_prompt_parser(arg, expected): + """This functionality is deprecated and will be removed in the future. + This argument should be passed as JSON string instead. 
+ + TODO: Remove with nullable_kvs.""" parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) if arg is None: args = parser.parse_args([]) @@ -53,12 +192,20 @@ def test_compilation_config(): assert args.compilation_config.level == 3 # set to string form of a dict - args = parser.parse_args(["--compilation-config", "{'level': 3}"]) - assert args.compilation_config.level == 3 + args = parser.parse_args([ + "--compilation-config", + "{'level': 3, 'cudagraph_capture_sizes': [1, 2, 4, 8]}", + ]) + assert (args.compilation_config.level == 3 and + args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]) # set to string form of a dict - args = parser.parse_args(["--compilation-config={'level': 3}"]) - assert args.compilation_config.level == 3 + args = parser.parse_args([ + "--compilation-config=" + "{'level': 3, 'cudagraph_capture_sizes': [1, 2, 4, 8]}", + ]) + assert (args.compilation_config.level == 3 and + args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]) def test_prefix_cache_default(): @@ -80,17 +227,6 @@ def test_prefix_cache_default(): assert not engine_args.enable_prefix_caching -def test_valid_pooling_config(): - parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) - args = parser.parse_args([ - '--override-pooler-config', - '{"pooling_type": "MEAN"}', - ]) - engine_args = EngineArgs.from_cli_args(args=args) - assert engine_args.override_pooler_config == PoolerConfig( - pooling_type="MEAN", ) - - @pytest.mark.parametrize( ("arg"), [ diff --git a/tests/engine/test_options.py b/tests/engine/test_options.py new file mode 100644 index 00000000000..0cf4f69d56a --- /dev/null +++ b/tests/engine/test_options.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: Apache-2.0 +from contextlib import nullcontext + +import pytest + +from vllm.entrypoints.llm import LLM +from vllm.sampling_params import SamplingParams + + +@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) +def test_skip_tokenizer_initialization(model: str): + # This test checks if the flag skip_tokenizer_init skips the initialization + # of tokenizer and detokenizer. The generated output is expected to contain + # token ids. + llm = LLM( + model=model, + skip_tokenizer_init=True, + enforce_eager=True, + ) + sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) + + with pytest.raises(ValueError, match="cannot pass text prompts when"): + llm.generate("abc", sampling_params) + + outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, + sampling_params=sampling_params) + assert len(outputs) > 0 + completions = outputs[0].outputs + assert len(completions) > 0 + assert completions[0].text == "" + assert completions[0].token_ids + + +@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) +@pytest.mark.parametrize("enable_prompt_embeds", [True, False]) +def test_enable_prompt_embeds(hf_runner, model: str, + enable_prompt_embeds: bool): + prompt = "abc" + + with hf_runner(model) as hf_model: + token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids + token_ids = token_ids.to(hf_model.model.device) + + embed_layer = hf_model.model.get_input_embeddings() + prompt_embeds = embed_layer(token_ids).squeeze(0) + + ctx = (nullcontext() if enable_prompt_embeds else pytest.raises( + ValueError, match="set `--enable-prompt-embeds`")) + + # This test checks if the flag skip_tokenizer_init skips the initialization + # of tokenizer and detokenizer. The generated output is expected to contain + # token ids. 
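+    # Build the engine with the flag under test: generating from
+    # prompt_embeds should only be accepted when --enable-prompt-embeds
+    # is set, otherwise the ValueError asserted above is expected.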
+ llm = LLM( + model=model, + enable_prompt_embeds=enable_prompt_embeds, + enforce_eager=True, + ) + + with ctx: + llm.generate({"prompt_embeds": prompt_embeds}) diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py deleted file mode 100644 index 5e197f5ffe5..00000000000 --- a/tests/engine/test_skip_tokenizer_init.py +++ /dev/null @@ -1,29 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -from vllm.entrypoints.llm import LLM -from vllm.sampling_params import SamplingParams - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_skip_tokenizer_initialization(model: str): - # This test checks if the flag skip_tokenizer_init skips the initialization - # of tokenizer and detokenizer. The generated output is expected to contain - # token ids. - llm = LLM( - model=model, - skip_tokenizer_init=True, - ) - sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) - - with pytest.raises(ValueError, match="cannot pass text prompts when"): - llm.generate("abc", sampling_params) - - outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, - sampling_params=sampling_params) - assert len(outputs) > 0 - completions = outputs[0].outputs - assert len(completions) > 0 - assert completions[0].text == "" - assert completions[0].token_ids diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index e96081c167e..742a6668344 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -1,15 +1,31 @@ # SPDX-License-Identifier: Apache-2.0 +import weakref import pytest from vllm import LLM +from vllm.distributed import cleanup_dist_env_and_memory from ..openai.test_vision import TEST_IMAGE_URLS -def test_chat(): - llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct") +@pytest.fixture(scope="function") +def text_llm(): + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + seed=0) + with llm.deprecate_legacy_api(): + yield weakref.proxy(llm) + + del llm + + cleanup_dist_env_and_memory() + + +def test_chat(text_llm): prompt1 = "Explain the concept of entropy." messages = [ { @@ -21,13 +37,11 @@ def test_chat(): "content": prompt1 }, ] - outputs = llm.chat(messages) + outputs = text_llm.chat(messages) assert len(outputs) == 1 -def test_multi_chat(): - llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct") - +def test_multi_chat(text_llm): prompt1 = "Explain the concept of entropy." prompt2 = "Explain what among us is." 
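The fixtures introduced above move engine construction out of the individual tests and tear the engine down deterministically. A minimal sketch of that teardown pattern in isolation (the fixture name and model below are placeholders, not part of this change): the fixture yields a weakref.proxy so it keeps the only strong reference to the LLM, then deletes it and calls cleanup_dist_env_and_memory() so distributed state and GPU memory are actually released between tests.

import weakref

import pytest

from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory


@pytest.fixture(scope="function")
def small_llm():
    # Placeholder fixture mirroring the text_llm/vision_llm fixtures above.
    llm = LLM(model="distilbert/distilgpt2", enforce_eager=True, seed=0)
    # Yield a proxy so tests never hold a strong reference to the engine.
    yield weakref.proxy(llm)
    # Drop the last strong reference, then free distributed state and memory.
    del llm
    cleanup_dist_env_and_memory()
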
@@ -55,13 +69,14 @@ def test_multi_chat(): messages = [conversation1, conversation2] - outputs = llm.chat(messages) + outputs = text_llm.chat(messages) assert len(outputs) == 2 -@pytest.mark.parametrize("image_urls", - [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) -def test_chat_multi_image(image_urls: list[str]): +@pytest.fixture(scope="function") +def vision_llm(): + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection llm = LLM( model="microsoft/Phi-3.5-vision-instruct", max_model_len=4096, @@ -69,8 +84,20 @@ def test_chat_multi_image(image_urls: list[str]): enforce_eager=True, trust_remote_code=True, limit_mm_per_prompt={"image": 2}, + seed=0, ) + with llm.deprecate_legacy_api(): + yield weakref.proxy(llm) + + del llm + + cleanup_dist_env_and_memory() + + +@pytest.mark.parametrize("image_urls", + [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) +def test_chat_multi_image(vision_llm, image_urls: list[str]): messages = [{ "role": "user", @@ -87,5 +114,83 @@ def test_chat_multi_image(image_urls: list[str]): }, ], }] - outputs = llm.chat(messages) + outputs = vision_llm.chat(messages) assert len(outputs) >= 0 + + +def test_llm_chat_tokenization_no_double_bos(text_llm): + """ + LLM.chat() should not add special tokens when using chat templates. + Check we get a single BOS token for llama chat. + """ + messages = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": "Hello!" + }, + ] + outputs = text_llm.chat(messages) + assert len(outputs) == 1 + + prompt_token_ids = outputs[0].prompt_token_ids + assert prompt_token_ids is not None + + bos_token = text_llm.get_tokenizer().bos_token_id + + # Ensure we have a single BOS + assert prompt_token_ids[0] == bos_token + assert prompt_token_ids[1] != bos_token, "Double BOS" + + +@pytest.fixture(scope="function") +def thinking_llm(): + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection + llm = LLM( + model="Qwen/Qwen3-0.6B", + max_model_len=4096, + enforce_eager=True, + seed=0, + ) + + with llm.deprecate_legacy_api(): + yield weakref.proxy(llm) + + del llm + + cleanup_dist_env_and_memory() + + +@pytest.mark.parametrize("enable_thinking", [True, False]) +def test_chat_extra_kwargs(thinking_llm, enable_thinking): + messages = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": "What is 1+1?" 
+ }, + ] + + outputs = thinking_llm.chat( + messages, + chat_template_kwargs={"enable_thinking": enable_thinking}, + ) + assert len(outputs) == 1 + + prompt_token_ids = outputs[0].prompt_token_ids + assert prompt_token_ids is not None + + think_id = thinking_llm.get_tokenizer().get_vocab()[""] + + if enable_thinking: + assert think_id not in prompt_token_ids + else: + # The chat template includes dummy thinking process + assert think_id in prompt_token_ids diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index e43e9826e8f..fdbdccd4654 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -16,10 +16,11 @@ MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" GUIDED_DECODING_BACKENDS = [ - "outlines", - "lm-format-enforcer", - "xgrammar:disable-any-whitespace", - "guidance:disable-any-whitespace", + # (backend, disable_any_whitespace), + ("outlines", False), + ("lm-format-enforcer", False), + ("xgrammar", True), + ("guidance", True), ] @@ -36,13 +37,17 @@ def llm(): @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -def test_guided_regex(sample_regex, llm, guided_decoding_backend: str): - sampling_params = SamplingParams(temperature=0.8, - top_p=0.95, - guided_decoding=GuidedDecodingParams( - regex=sample_regex, - backend=guided_decoding_backend)) +@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) +def test_guided_regex(sample_regex, llm, guided_decoding_backend: str, + disable_any_whitespace: bool): + sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + guided_decoding=GuidedDecodingParams( + regex=sample_regex, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace)) outputs = llm.generate(prompts=[ f"Give an example IPv4 address with this regex: {sample_regex}" ] * 2, @@ -62,14 +67,18 @@ def test_guided_regex(sample_regex, llm, guided_decoding_backend: str): @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) +@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) def test_guided_json_completion(sample_json_schema, llm, - guided_decoding_backend: str): - sampling_params = SamplingParams(temperature=1.0, - max_tokens=1000, - guided_decoding=GuidedDecodingParams( - json=sample_json_schema, - backend=guided_decoding_backend)) + guided_decoding_backend: str, + disable_any_whitespace: bool): + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams( + json=sample_json_schema, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace)) outputs = llm.generate(prompts=[ f"Give an example JSON for an employee profile " f"that fits this schema: {sample_json_schema}" @@ -92,14 +101,18 @@ def test_guided_json_completion(sample_json_schema, llm, @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) +@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) def test_guided_complex_json_completion(sample_complex_json_schema, llm, - guided_decoding_backend: str): - sampling_params = SamplingParams(temperature=1.0, - max_tokens=1000, - guided_decoding=GuidedDecodingParams( - json=sample_complex_json_schema, - backend=guided_decoding_backend)) + 
guided_decoding_backend: str, + disable_any_whitespace: bool): + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams( + json=sample_complex_json_schema, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace)) outputs = llm.generate(prompts=[ f"Give an example JSON for an assignment grade " f"that fits this schema: {sample_complex_json_schema}" @@ -123,14 +136,18 @@ def test_guided_complex_json_completion(sample_complex_json_schema, llm, @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) +@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) def test_guided_definition_json_completion(sample_definition_json_schema, llm, - guided_decoding_backend: str): - sampling_params = SamplingParams(temperature=1.0, - max_tokens=1000, - guided_decoding=GuidedDecodingParams( - json=sample_definition_json_schema, - backend=guided_decoding_backend)) + guided_decoding_backend: str, + disable_any_whitespace: bool): + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams( + json=sample_definition_json_schema, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace)) outputs = llm.generate(prompts=[ f"Give an example JSON for solving 8x + 7 = -23 " f"that fits this schema: {sample_definition_json_schema}" @@ -154,14 +171,18 @@ def test_guided_definition_json_completion(sample_definition_json_schema, llm, @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) +@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) def test_guided_enum_json_completion(sample_enum_json_schema, llm, - guided_decoding_backend: str): - sampling_params = SamplingParams(temperature=1.0, - max_tokens=1000, - guided_decoding=GuidedDecodingParams( - json=sample_enum_json_schema, - backend=guided_decoding_backend)) + guided_decoding_backend: str, + disable_any_whitespace: bool): + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams( + json=sample_enum_json_schema, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace)) outputs = llm.generate(prompts=[ "Create a bug report JSON that fits this schema: " f"{sample_enum_json_schema}. Make it for a high priority critical bug." 
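As the hunks above show, per-backend options are no longer encoded in the backend string (e.g. "xgrammar:disable-any-whitespace"); they are passed as explicit GuidedDecodingParams fields. A minimal sketch of the new call shape, using a toy schema that is only for illustration:

from vllm.sampling_params import GuidedDecodingParams, SamplingParams

# Toy schema, for illustration only.
schema = {"type": "object", "properties": {"name": {"type": "string"}}}

sampling_params = SamplingParams(
    temperature=1.0,
    max_tokens=256,
    guided_decoding=GuidedDecodingParams(
        json=schema,
        backend="xgrammar",
        # Formerly spelled as the "xgrammar:disable-any-whitespace" backend string.
        disable_any_whitespace=True,
    ),
)
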
@@ -195,14 +216,18 @@ def test_guided_enum_json_completion(sample_enum_json_schema, llm, @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) +@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) def test_guided_choice_completion(sample_guided_choice, llm, - guided_decoding_backend: str): - sampling_params = SamplingParams(temperature=0.8, - top_p=0.95, - guided_decoding=GuidedDecodingParams( - choice=sample_guided_choice, - backend=guided_decoding_backend)) + guided_decoding_backend: str, + disable_any_whitespace: bool): + sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + guided_decoding=GuidedDecodingParams( + choice=sample_guided_choice, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace)) outputs = llm.generate( prompts="The best language for type-safe systems programming is ", sampling_params=sampling_params, @@ -221,15 +246,19 @@ def test_guided_choice_completion(sample_guided_choice, llm, @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) +@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) def test_guided_grammar(sample_sql_statements, llm, - guided_decoding_backend: str): - sampling_params = SamplingParams(temperature=0.8, - top_p=0.95, - max_tokens=1000, - guided_decoding=GuidedDecodingParams( - grammar=sample_sql_statements, - backend=guided_decoding_backend)) + guided_decoding_backend: str, + disable_any_whitespace: bool): + sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + max_tokens=1000, + guided_decoding=GuidedDecodingParams( + grammar=sample_sql_statements, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace)) outputs = llm.generate( prompts=("Generate a sql state that select col_1 from " "table_1 where it is equals to 1"), @@ -300,26 +329,31 @@ def test_disable_guided_decoding_fallback(sample_regex, llm): top_p=0.95, guided_decoding=GuidedDecodingParams( json=unsupported_json, - backend="xgrammar:no-fallback")) + backend="xgrammar", + disable_fallback=True)) with pytest.raises( ValueError, match="xgrammar does not support advanced JSON schema features " - "like enums, patterns or numeric ranges."): + "like string length, item limits, or property bounds."): llm.generate(prompts="This should fail", sampling_params=sampling_params, use_tqdm=True) @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -def test_guided_json_object(llm, guided_decoding_backend: str): - sampling_params = SamplingParams(temperature=1.0, - max_tokens=100, - n=2, - guided_decoding=GuidedDecodingParams( - json_object=True, - backend=guided_decoding_backend)) +@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) +def test_guided_json_object(llm, guided_decoding_backend: str, + disable_any_whitespace: bool): + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=100, + n=2, + guided_decoding=GuidedDecodingParams( + json_object=True, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace)) outputs = llm.generate( prompts=("Generate a JSON object with curly braces for a person with " @@ -337,7 +371,7 @@ def test_guided_json_object(llm, guided_decoding_backend: str): print(generated_text) assert generated_text is not None - if 
'disable-any-whitespace' in guided_decoding_backend: + if disable_any_whitespace: assert "\n" not in generated_text # Parse to verify it is valid JSON @@ -359,14 +393,18 @@ class CarDescription(BaseModel): @pytest.mark.skip_global_cleanup -@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) -def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str): +@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) +def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str, + disable_any_whitespace: bool): json_schema = CarDescription.model_json_schema() - sampling_params = SamplingParams(temperature=1.0, - max_tokens=1000, - guided_decoding=GuidedDecodingParams( - json=json_schema, - backend=guided_decoding_backend)) + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams( + json=json_schema, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace)) outputs = llm.generate( prompts="Generate a JSON with the brand, model and car_type of" "the most iconic car from the 90's", @@ -383,4 +421,124 @@ def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str): assert generated_text is not None print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") output_json = json.loads(generated_text) - jsonschema.validate(instance=output_json, schema=json_schema) \ No newline at end of file + jsonschema.validate(instance=output_json, schema=json_schema) + + +@pytest.mark.skip_global_cleanup +@pytest.mark.parametrize("guided_decoding_backend,disable_any_whitespace", + GUIDED_DECODING_BACKENDS) +def test_guided_number_range_json_completion(llm, guided_decoding_backend: str, + disable_any_whitespace: bool): + sample_output_schema = { + "type": "object", + "properties": { + "age": { + "type": "integer", + "minimum": 18, + "maximum": 99 + }, + "score": { + "type": "number", + "minimum": 0.0, + "maximum": 100.0 + }, + "zipcode": { + "type": "string", + "pattern": r"^\d{5}(-\d{4})?$" + }, + }, + "required": ["age", "score", "zipcode"], + } + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams( + json=sample_output_schema, + backend=guided_decoding_backend, + disable_any_whitespace=disable_any_whitespace), + ) + outputs = llm.generate( + prompts=[ + "Create a JSON object for a user with age, score, and zipcode." + ] * 2, + sampling_params=sampling_params, + use_tqdm=True, + ) + + assert outputs is not None + + for output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + prompt = output.prompt + + generated_text = output.outputs[0].text + assert generated_text is not None + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + output_json = json.loads(generated_text) + jsonschema.validate(instance=output_json, schema=sample_output_schema) + assert 18 <= output_json["age"] <= 99 + assert 0.0 <= output_json["score"] <= 100.0 + assert (re.fullmatch(r"^\d{5}(-\d{4})?$", output_json["zipcode"]) + is not None) + + +@pytest.mark.skip_global_cleanup +def test_guidance_no_additional_properties(llm): + schema = { + 'type': 'object', + 'properties': { + 'a1': { + 'type': 'string' + }, + 'a2': { + 'type': 'string' + }, + 'a3': { + 'type': 'string' + } + }, + 'required': ['a1', 'a2', 'a3'], + } + + prompt = ( + "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. 
You are a " + "helpful assistant.<|im_end|>\n<|im_start|>user\nPlease generate a " + "large JSON object with key-value pairs a1=b1, a2=b2, ..., a20=b20" + "<|im_end|>\n<|im_start|>assistant\n") + + def generate_with_backend(backend, disable_additional_properties): + guided_params = GuidedDecodingParams( + json=schema, + backend=backend, + disable_any_whitespace=True, + disable_additional_properties=disable_additional_properties) + sampling_params = SamplingParams(temperature=0, + max_tokens=256, + guided_decoding=guided_params) + + outputs = llm.generate(prompts=prompt, sampling_params=sampling_params) + assert outputs is not None + generated_text = outputs[0].outputs[0].text + assert generated_text is not None + parsed_json = json.loads(generated_text) + assert isinstance(parsed_json, dict) + jsonschema.validate(instance=parsed_json, schema=schema) + return parsed_json + + base_generated = generate_with_backend("guidance", False) + assert "a1" in base_generated + assert "a2" in base_generated + assert "a3" in base_generated + # by default additional keys are generated + assert "a4" in base_generated + assert "a5" in base_generated + assert "a6" in base_generated + + generated = generate_with_backend("guidance", True) + assert "a1" in generated + assert "a2" in generated + assert "a3" in generated + assert "a4" not in generated + assert "a5" not in generated + assert "a6" not in generated diff --git a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py index eca5d184f5d..642c204b9ff 100644 --- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py +++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py @@ -150,6 +150,7 @@ def test_wer_correctness(model_name, expected_wer, n_examples=-1, max_concurrent_request=None): + # TODO refactor to use `ASRDataset` with RemoteOpenAIServer(model_name, ['--enforce-eager']) as remote_server: dataset = load_hf_dataset(dataset_repo) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index b13002a5b68..72e61665677 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +import json + import openai import pytest import pytest_asyncio @@ -27,7 +29,7 @@ def server(): "--enforce-eager", "--trust-remote-code", "--limit-mm-per-prompt", - f"audio={MAXIMUM_AUDIOS}", + json.dumps({"audio": MAXIMUM_AUDIOS}), ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -102,6 +104,35 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) +async def test_error_on_invalid_audio_url_type(client: openai.AsyncOpenAI, + model_name: str, + audio_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "audio_url", + "audio_url": audio_url + }, + { + "type": "text", + "text": "What's happening in this audio?" 
+ }, + ], + }] + + # audio_url should be a dict {"url": "some url"}, not directly a string + with pytest.raises(openai.BadRequestError): + _ = await client.chat.completions.create(model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0) + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) diff --git a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py index 53df1d9241b..e00f001ef73 100644 --- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py +++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py @@ -13,9 +13,9 @@ @pytest.fixture(scope="module") def server(): # noqa: F811 args = [ - "--max-model-len", "8192", "--enforce-eager", "--enable-reasoning", - "--reasoning-parser", "deepseek_r1", "--enable-auto-tool-choice", - "--tool-call-parser", "hermes" + "--max-model-len", "8192", "--enforce-eager", "--reasoning-parser", + "deepseek_r1", "--enable-auto-tool-choice", "--tool-call-parser", + "hermes" ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index e0285b5e556..8d1abe28a02 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -122,31 +122,23 @@ def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser): """Ensure validation fails if reasoning is enabled with auto tool choice""" args = serve_parser.parse_args(args=[ "--enable-auto-tool-choice", - "--enable-reasoning", + "--reasoning-parser", + "deepseek_r1", ]) with pytest.raises(TypeError): validate_parsed_serve_args(args) -def test_enable_reasoning_passes_with_reasoning_parser(serve_parser): +def test_passes_with_reasoning_parser(serve_parser): """Ensure validation passes if reasoning is enabled with a reasoning parser""" args = serve_parser.parse_args(args=[ - "--enable-reasoning", "--reasoning-parser", "deepseek_r1", ]) validate_parsed_serve_args(args) -def test_enable_reasoning_fails_without_reasoning_parser(serve_parser): - """Ensure validation fails if reasoning is enabled - without a reasoning parser""" - args = serve_parser.parse_args(args=["--enable-reasoning"]) - with pytest.raises(TypeError): - validate_parsed_serve_args(args) - - def test_chat_template_validation_for_happy_paths(serve_parser): """Ensure validation passes if the chat template exists""" args = serve_parser.parse_args( diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 2cdeb684f75..1019bfd5893 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -11,11 +11,12 @@ from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.transformers_utils.tokenizer import get_tokenizer -from ...models.embedding.utils import check_embeddings_close +from ...models.utils import run_embedding_correctness_test from ...utils import RemoteOpenAIServer MODEL_NAME = "intfloat/multilingual-e5-small" DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 +DTYPE = "bfloat16" @pytest.fixture(scope="module") @@ -25,7 +26,7 @@ def server(): "embed", # use half precision for speed and memory savings in CI environment "--dtype", - "bfloat16", + DTYPE, "--enforce-eager", "--max-model-len", "512", @@ -43,9 +44,17 @@ async def 
client(server): yield async_client +@pytest.fixture(scope="module") +def hf_model(hf_runner): + with hf_runner(MODEL_NAME, dtype=DTYPE, + is_sentence_transformer=True) as hf_model: + yield hf_model + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): +async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, + model_name: str): input_texts = [ "The chef prepared a delicious meal.", ] @@ -66,6 +75,9 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): assert embeddings.usage.prompt_tokens == 11 assert embeddings.usage.total_tokens == 11 + vllm_outputs = [d.embedding for d in embeddings.data] + run_embedding_correctness_test(hf_model, input_texts, vllm_outputs) + # test using token IDs input_tokens = [1, 1, 1, 1, 1] embedding_response = await client.embeddings.create( @@ -86,7 +98,8 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): +async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, + model_name: str): # test list[str] input_texts = [ "The cat sat on the mat.", "A feline was resting on a rug.", @@ -107,6 +120,9 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): assert embeddings.usage.prompt_tokens == 33 assert embeddings.usage.total_tokens == 33 + vllm_outputs = [d.embedding for d in embeddings.data] + run_embedding_correctness_test(hf_model, input_texts, vllm_outputs) + # test list[list[int]] input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], [25, 32, 64, 77]] @@ -181,7 +197,7 @@ async def test_conversation_embedding(server: RemoteOpenAIServer, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_batch_base64_embedding(client: openai.AsyncOpenAI, +async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str): input_texts = [ "Hello my name is", @@ -192,6 +208,7 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI, model=model_name, encoding_format="float") float_data = [d.embedding for d in responses_float.data] + run_embedding_correctness_test(hf_model, input_texts, float_data) responses_base64 = await client.embeddings.create(input=input_texts, model=model_name, @@ -202,24 +219,13 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI, np.frombuffer(base64.b64decode(data.embedding), dtype="float32").tolist()) - check_embeddings_close( - embeddings_0_lst=float_data, - embeddings_1_lst=base64_data, - name_0="float", - name_1="base64", - ) + run_embedding_correctness_test(hf_model, input_texts, base64_data) # Default response is float32 decoded from base64 by OpenAI Client responses_default = await client.embeddings.create(input=input_texts, model=model_name) default_data = [d.embedding for d in responses_default.data] - - check_embeddings_close( - embeddings_0_lst=float_data, - embeddings_1_lst=default_data, - name_0="float", - name_1="default", - ) + run_embedding_correctness_test(hf_model, input_texts, default_data) @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_embedding_dimensions.py b/tests/entrypoints/openai/test_embedding_dimensions.py index 79d43a2231f..332fa332a4a 100644 --- a/tests/entrypoints/openai/test_embedding_dimensions.py +++ 
b/tests/entrypoints/openai/test_embedding_dimensions.py @@ -3,80 +3,122 @@ Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`. """ -from typing import NamedTuple +from typing import Optional import openai import pytest from vllm.entrypoints.openai.protocol import EmbeddingResponse +from ...conftest import HfRunner +from ...models.utils import EmbedModelInfo, run_embedding_correctness_test from ...utils import RemoteOpenAIServer - -class ModelInfo(NamedTuple): - name: str - is_matryoshka: bool - - MODELS = [ - ModelInfo(name="BAAI/bge-m3", is_matryoshka=False), - ModelInfo(name="jinaai/jina-embeddings-v3", is_matryoshka=True), + EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False), + EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5", + is_matryoshka=True, + matryoshka_dimensions=[256]), ] input_texts = [ "The chef prepared a delicious meal.", -] * 3 +] -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -async def test_validating_dimensions(model: ModelInfo): +@pytest.fixture(scope="module", params=MODELS) +def model_info(request): + return request.param + + +@pytest.fixture(scope="module", params=["bfloat16"]) +def dtype(request): + return request.param + + +@pytest.fixture(scope="module") +def server(model_info, dtype: str): args = [ "--task", "embed", # use half precision for speed and memory savings in CI environment "--dtype", - "bfloat16", + dtype, "--enforce-eager", "--max-model-len", - "512", - "--trust_remote_code" + "512" ] - with RemoteOpenAIServer(model.name, args) as remote_server: - client = remote_server.get_async_client() - - async def make_request(dimensions): - embedding_response = await client.embeddings.create( - model=model.name, - input=input_texts, - dimensions=dimensions, - encoding_format="float", - ) - embeddings = EmbeddingResponse.model_validate( - embedding_response.model_dump(mode="json")) - - assert embeddings.id is not None - assert len(embeddings.data) == 3 - assert len(embeddings.data[0].embedding) > 0 - assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens > 0 - assert embeddings.usage.total_tokens > 0 - - if dimensions is not None: - assert len(embeddings.data[0].embedding) == dimensions - - if model.is_matryoshka: - for dimensions in [None, 16]: - await make_request(dimensions) + if model_info.name == "Snowflake/snowflake-arctic-embed-m-v1.5": + # Manually enable Matryoshka Embeddings + args.extend([ + "--trust_remote_code", "--hf_overrides", + '{"matryoshka_dimensions":[256]}' + ]) + + with RemoteOpenAIServer(model_info.name, args) as remote_server: + yield remote_server + + +@pytest.fixture(scope="module") +def hf_model(hf_runner, model_info, dtype: str): + with hf_runner(model_info.name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + yield hf_model + + +@pytest.mark.asyncio +async def test_matryoshka(model_info: EmbedModelInfo, + server: RemoteOpenAIServer, hf_model: HfRunner): + client = server.get_async_client() + + async def make_request_and_correctness_test(dimensions): + prompts = input_texts * 3 + + embedding_response = await client.embeddings.create( + model=model_info.name, + input=prompts, + dimensions=dimensions, + encoding_format="float", + ) + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + + assert embeddings.id is not None + assert len(embeddings.data) == 3 + assert len(embeddings.data[0].embedding) > 0 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens > 0 
+ assert embeddings.usage.total_tokens > 0 + + if dimensions is not None: + assert len(embeddings.data[0].embedding) == dimensions + + vllm_outputs = [d.embedding for d in embeddings.data] + run_embedding_correctness_test(hf_model, prompts, vllm_outputs, + dimensions) + + if model_info.is_matryoshka: + valid_dimensions: list[Optional[int]] = [None] + if model_info.matryoshka_dimensions is not None: + valid_dimensions += model_info.matryoshka_dimensions[:2] + + for dimensions in valid_dimensions: + await make_request_and_correctness_test(dimensions) + + invalid_dimensions: list[Optional[int]] = [-1] + if model_info.matryoshka_dimensions is not None: + assert 5 not in model_info.matryoshka_dimensions + invalid_dimensions.append(5) + + for dimensions in invalid_dimensions: with pytest.raises(openai.BadRequestError): - for dimensions in [-1]: - await make_request(dimensions) + await make_request_and_correctness_test(dimensions) - else: - for dimensions in [None]: - await make_request(dimensions) + else: + for dimensions in [None]: + await make_request_and_correctness_test(dimensions) + for dimensions in [-1, 16]: with pytest.raises(openai.BadRequestError): - for dimensions in [-1, 16]: - await make_request(dimensions) + await make_request_and_correctness_test(dimensions) diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py new file mode 100644 index 00000000000..c96151349eb --- /dev/null +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -0,0 +1,209 @@ +# SPDX-License-Identifier: Apache-2.0 + +from contextlib import suppress +from dataclasses import dataclass, field +from http import HTTPStatus +from typing import Optional +from unittest.mock import MagicMock + +import pytest + +from vllm.config import MultiModalConfig +from vllm.engine.multiprocessing.client import MQLLMEngineClient +from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse +from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion +from vllm.entrypoints.openai.serving_models import (BaseModelPath, + OpenAIServingModels) +from vllm.lora.request import LoRARequest +from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry +from vllm.transformers_utils.tokenizer import get_tokenizer + +MODEL_NAME = "openai-community/gpt2" +BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)] + +MOCK_RESOLVER_NAME = "mock_test_resolver" + + +@dataclass +class MockHFConfig: + model_type: str = "any" + + +@dataclass +class MockModelConfig: + """Minimal mock ModelConfig for testing.""" + model: str = MODEL_NAME + tokenizer: str = MODEL_NAME + trust_remote_code: bool = False + tokenizer_mode: str = "auto" + max_model_len: int = 100 + tokenizer_revision: Optional[str] = None + multimodal_config: MultiModalConfig = field( + default_factory=MultiModalConfig) + hf_config: MockHFConfig = field(default_factory=MockHFConfig) + logits_processor_pattern: Optional[str] = None + diff_sampling_param: Optional[dict] = None + allowed_local_media_path: str = "" + encoder_config = None + generation_config: str = "auto" + + def get_diff_sampling_param(self): + return self.diff_sampling_param or {} + + +class MockLoRAResolver(LoRAResolver): + + async def resolve_lora(self, base_model_name: str, + lora_name: str) -> Optional[LoRARequest]: + if lora_name == "test-lora": + return LoRARequest(lora_name="test-lora", + lora_int_id=1, + lora_local_path="/fake/path/test-lora") + elif lora_name == "invalid-lora": + return 
LoRARequest(lora_name="invalid-lora", + lora_int_id=2, + lora_local_path="/fake/path/invalid-lora") + return None + + +@pytest.fixture(autouse=True) +def register_mock_resolver(): + """Fixture to register and unregister the mock LoRA resolver.""" + resolver = MockLoRAResolver() + LoRAResolverRegistry.register_resolver(MOCK_RESOLVER_NAME, resolver) + yield + # Cleanup: remove the resolver after the test runs + if MOCK_RESOLVER_NAME in LoRAResolverRegistry.resolvers: + del LoRAResolverRegistry.resolvers[MOCK_RESOLVER_NAME] + + +@pytest.fixture +def mock_serving_setup(): + """Provides a mocked engine and serving completion instance.""" + mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + + def mock_add_lora_side_effect(lora_request: LoRARequest): + """Simulate engine behavior when adding LoRAs.""" + if lora_request.lora_name == "test-lora": + # Simulate successful addition + return + elif lora_request.lora_name == "invalid-lora": + # Simulate failure during addition (e.g. invalid format) + raise ValueError(f"Simulated failure adding LoRA: " + f"{lora_request.lora_name}") + + mock_engine.add_lora.side_effect = mock_add_lora_side_effect + mock_engine.generate.reset_mock() + mock_engine.add_lora.reset_mock() + + mock_model_config = MockModelConfig() + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config) + + serving_completion = OpenAIServingCompletion(mock_engine, + mock_model_config, + models, + request_logger=None) + + return mock_engine, serving_completion + + +@pytest.mark.asyncio +async def test_serving_completion_with_lora_resolver(mock_serving_setup, + monkeypatch): + monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true") + + mock_engine, serving_completion = mock_serving_setup + + lora_model_name = "test-lora" + req_found = CompletionRequest( + model=lora_model_name, + prompt="Generate with LoRA", + ) + + # Suppress potential errors during the mocked generate call, + # as we are primarily checking for add_lora and generate calls + with suppress(Exception): + await serving_completion.create_completion(req_found) + + mock_engine.add_lora.assert_called_once() + called_lora_request = mock_engine.add_lora.call_args[0][0] + assert isinstance(called_lora_request, LoRARequest) + assert called_lora_request.lora_name == lora_model_name + + mock_engine.generate.assert_called_once() + called_lora_request = mock_engine.generate.call_args[1]['lora_request'] + assert isinstance(called_lora_request, LoRARequest) + assert called_lora_request.lora_name == lora_model_name + + +@pytest.mark.asyncio +async def test_serving_completion_resolver_not_found(mock_serving_setup, + monkeypatch): + monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true") + + mock_engine, serving_completion = mock_serving_setup + + non_existent_model = "non-existent-lora-adapter" + req = CompletionRequest( + model=non_existent_model, + prompt="what is 1+1?", + ) + + response = await serving_completion.create_completion(req) + + mock_engine.add_lora.assert_not_called() + mock_engine.generate.assert_not_called() + + assert isinstance(response, ErrorResponse) + assert response.code == HTTPStatus.NOT_FOUND.value + assert non_existent_model in response.message + + +@pytest.mark.asyncio +async def test_serving_completion_resolver_add_lora_fails( + mock_serving_setup, monkeypatch): + monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true") + + 
mock_engine, serving_completion = mock_serving_setup + + invalid_model = "invalid-lora" + req = CompletionRequest( + model=invalid_model, + prompt="what is 1+1?", + ) + + response = await serving_completion.create_completion(req) + + # Assert add_lora was called before the failure + mock_engine.add_lora.assert_called_once() + called_lora_request = mock_engine.add_lora.call_args[0][0] + assert isinstance(called_lora_request, LoRARequest) + assert called_lora_request.lora_name == invalid_model + + # Assert generate was *not* called due to the failure + mock_engine.generate.assert_not_called() + + # Assert the correct error response + assert isinstance(response, ErrorResponse) + assert response.code == HTTPStatus.BAD_REQUEST.value + assert invalid_model in response.message + + +@pytest.mark.asyncio +async def test_serving_completion_flag_not_set(mock_serving_setup): + mock_engine, serving_completion = mock_serving_setup + + lora_model_name = "test-lora" + req_found = CompletionRequest( + model=lora_model_name, + prompt="Generate with LoRA", + ) + + await serving_completion.create_completion(req_found) + + mock_engine.add_lora.assert_not_called() + mock_engine.generate.assert_not_called() diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py new file mode 100644 index 00000000000..1ccb803a328 --- /dev/null +++ b/tests/entrypoints/openai/test_openai_schema.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest +import schemathesis +from schemathesis import GenerationConfig + +from ...utils import RemoteOpenAIServer + +schemathesis.experimental.OPEN_API_3_1.enable() + +MODEL_NAME = "HuggingFaceTB/SmolVLM-256M-Instruct" +MAXIMUM_IMAGES = 2 + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--task", + "generate", + "--max-model-len", + "2048", + "--max-num-seqs", + "5", + "--enforce-eager", + "--trust-remote-code", + "--limit-mm-per-prompt", + f"image={MAXIMUM_IMAGES}", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.fixture(scope="module") +def get_schema(server): + # avoid generating null (\x00) bytes in strings during test case generation + return schemathesis.openapi.from_uri( + f"{server.url_root}/openapi.json", + generation_config=GenerationConfig(allow_x00=False), + ) + + +schema = schemathesis.from_pytest_fixture("get_schema") + + +@schema.parametrize() +@schema.override(headers={"Content-Type": "application/json"}) +async def test_openapi_stateless(case): + #No need to verify SSL certificate for localhost + await case.call_and_validate(verify=False) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 19d16713b20..5e11af8cf89 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -272,3 +272,43 @@ def test_serving_chat_could_load_correct_generation_config(): assert mock_engine.generate.call_args.args[1].temperature == 0.0 assert mock_engine.generate.call_args.args[1].repetition_penalty == 1.05 + + +def test_serving_chat_did_set_correct_cache_salt(): + mock_model_config = MockModelConfig() + + mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + + # Initialize the serving chat + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config) + serving_chat = 
OpenAIServingChat(mock_engine, + mock_model_config, + models, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", + request_logger=None) + + # Test cache_salt + req = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{ + "role": "user", + "content": "what is 1+1?" + }], + ) + + # By default cache_salt in the engine prompt is not set + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + assert "cache_salt" not in mock_engine.generate.call_args.args[0] + + # Test with certain cache_salt + req.cache_salt = "test_salt" + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + assert mock_engine.generate.call_args.args[0]["cache_salt"] == "test_salt" diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 29571bcd764..5c48df3cebb 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -192,3 +192,36 @@ async def post_with_stream(*args, **kwargs): else: continuous = continuous and hasattr(chunk, 'usage') assert final and continuous + + +@pytest.mark.asyncio +async def test_sampling_params(mary_had_lamb): + """ + Compare sampling with params and greedy sampling to assert results + are different when extreme sampling parameters values are picked. + """ + model_name = "openai/whisper-small" + server_args = ["--enforce-eager"] + with RemoteOpenAIServer(model_name, server_args) as remote_server: + client = remote_server.get_async_client() + transcription = await client.audio.transcriptions.create( + model=model_name, + file=mary_had_lamb, + language="en", + temperature=0.8, + extra_body=dict(seed=42, + repetition_penalty=1.9, + top_k=12, + top_p=0.4, + min_p=0.5, + frequency_penalty=1.8, + presence_penalty=2.0)) + + greedy_transcription = await client.audio.transcriptions.create( + model=model_name, + file=mary_had_lamb, + language="en", + temperature=0.0, + extra_body=dict(seed=42)) + + assert greedy_transcription.text != transcription.text diff --git a/tests/entrypoints/openai/test_truncation.py b/tests/entrypoints/openai/test_truncation.py new file mode 100644 index 00000000000..137ed9db858 --- /dev/null +++ b/tests/entrypoints/openai/test_truncation.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Any + +import openai +import pytest +import pytest_asyncio + +from tests.utils import RemoteOpenAIServer + +MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2" +max_model_len = 128 + +input = """Immerse yourself in the enchanting chronicle of calculus, a + mathematical domain that has radically transformed our comprehension of + change and motion. Despite its roots in ancient civilizations, the + formal birth of calculus predominantly occurred in the 17th century, + primarily under the influential guidance of Sir Isaac Newton and Gottfried + Wilhelm Leibniz. The earliest traces of calculus concepts are found in + ancient Greek mathematics,most notably in the works of Eudoxus and + Archimedes, around 300 BCE. They utilized the 'method of exhaustion'—a + technique for computing areas and volumes through the use of finite sums. + This methodology laid crucial foundational work for integral calculus. 
+ In the 17th century, both Newton and Leibniz independently pioneered + calculus, each contributing unique perspectives that would shape this new + field.""" + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--task", + "embed", + "--dtype", + "bfloat16", + "--enforce-eager", + "--max-model-len", + str(max_model_len), + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +async def test_smaller_truncation_size(client: openai.AsyncOpenAI): + truncation_size = 10 + kwargs: dict[str, Any] = { + "model": MODEL_NAME, + "input": input, + "truncate_prompt_tokens": truncation_size + } + + response = await client.post(path="embeddings", + cast_to=object, + body={**kwargs}) + + assert response["usage"]["prompt_tokens"] == truncation_size + + +@pytest.mark.asyncio +async def test_bigger_truncation_size(client: openai.AsyncOpenAI): + truncation_size = max_model_len + 1 + kwargs: dict[str, Any] = { + "model": MODEL_NAME, + "input": input, + "truncate_prompt_tokens": truncation_size + } + + with pytest.raises(openai.BadRequestError) as err: + err = await client.post(path="embeddings", + cast_to=object, + body={**kwargs}) + + assert str(err) == f"""openai.BadRequestError: + Error code: 400 - {{'object': 'error', + 'message': 'truncate_prompt_tokens value + ({truncation_size}) + is greater than max_model_len ({max_model_len}). + Please, select a smaller truncation size.', + 'type': 'BadRequestError', + 'param': None, 'code': 400}}""" + + +@pytest.mark.asyncio +async def test_max_truncation_size(client: openai.AsyncOpenAI): + truncation_size = -1 + kwargs: dict[str, Any] = { + "model": MODEL_NAME, + "input": input, + "truncate_prompt_tokens": truncation_size + } + + response = await client.post(path="embeddings", + cast_to=object, + body={**kwargs}) + + assert response["usage"]["prompt_tokens"] == max_model_len diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index f9ccce9c1c3..53f057a294c 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +import json + import openai import pytest import pytest_asyncio @@ -31,7 +33,7 @@ def server(): "--enforce-eager", "--trust-remote-code", "--limit-mm-per-prompt", - f"video={MAXIMUM_VIDEOS}", + json.dumps({"video": MAXIMUM_VIDEOS}), ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -106,6 +108,35 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI, assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_error_on_invalid_video_url_type(client: openai.AsyncOpenAI, + model_name: str, + video_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "video_url", + "video_url": video_url + }, + { + "type": "text", + "text": "What's in this video?" 
+ }, + ], + }] + + # video_url should be a dict {"url": "some url"}, not directly a string + with pytest.raises(openai.BadRequestError): + _ = await client.chat.completions.create(model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0) + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 4b9029ded41..1ab50b41c7e 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +import json + import openai import pytest import pytest_asyncio @@ -35,7 +37,7 @@ def server(): "--enforce-eager", "--trust-remote-code", "--limit-mm-per-prompt", - f"image={MAXIMUM_IMAGES}", + json.dumps({"image": MAXIMUM_IMAGES}), ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -135,6 +137,36 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI, + model_name: str, + image_url: str): + content_text = "What's in this image?" + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": image_url + }, + { + "type": "text", + "text": content_text + }, + ], + }] + + # image_url should be a dict {"url": "some url"}, not directly a string + with pytest.raises(openai.BadRequestError): + _ = await client.chat.completions.create(model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0) + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index 3e6f13e10ac..26c68e06c19 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +import json + import pytest import requests from PIL import Image @@ -37,7 +39,7 @@ def server(): "--enforce-eager", "--trust-remote-code", "--limit-mm-per-prompt", - f"image={MAXIMUM_IMAGES}", + json.dumps({"image": MAXIMUM_IMAGES}), "--chat-template", str(vlm2vec_jinja_path), ] diff --git a/tests/kernels/conftest.py b/tests/kernels/attention/conftest.py similarity index 100% rename from tests/kernels/conftest.py rename to tests/kernels/attention/conftest.py diff --git a/tests/kernels/test_attention.py b/tests/kernels/attention/test_attention.py similarity index 99% rename from tests/kernels/test_attention.py rename to tests/kernels/attention/test_attention.py index 0d7898a900e..e5650136f25 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -6,13 +6,12 @@ import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.utils import opcheck from vllm import _custom_ops as ops from vllm.platforms import current_platform from vllm.utils import get_max_shared_memory_bytes -from .allclose_default import get_default_atol, get_default_rtol - if not current_platform.is_rocm(): from xformers import ops as xops from xformers.ops.fmha.attn_bias import 
BlockDiagonalCausalMask diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py new file mode 100644 index 00000000000..b0414244c21 --- /dev/null +++ b/tests/kernels/attention/test_attention_selector.py @@ -0,0 +1,252 @@ +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import patch + +import pytest +import torch + +from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend +from vllm.platforms.cpu import CpuPlatform +from vllm.platforms.cuda import CudaPlatform +from vllm.platforms.rocm import RocmPlatform +from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL + + +@pytest.fixture(autouse=True) +def clear_cache(): + """Clear lru cache to ensure each test case runs without caching. + """ + _cached_get_attn_backend.cache_clear() + + +# Define MLA and non-MLA backends separately +DEVICE_MLA_BACKENDS = { + "cuda": ["TRITON_MLA", "FLASHMLA"], + "hip": ["TRITON_MLA", "ROCM_AITER_MLA"], + "cpu": [], +} + +DEVICE_REGULAR_ATTN_BACKENDS = { + "cuda": ["XFORMERS", "FLASHINFER"], + "hip": ["ROCM_FLASH"], + "cpu": ["TORCH_SDPA"], +} + +DEVICE_MLA_BLOCK_SIZES = { + "cuda": [16, 64], # CUDA supports both standard and extended block sizes + "hip": [16, 1], # HIP requires special handling for block_size=1 + "cpu": [16] # CPU uses fixed block size from test cases +} + + +def generate_params(): + params = [] + for use_mla in [True, False]: + for device in ["cuda", "hip", "cpu"]: + backends = DEVICE_MLA_BACKENDS[ + device] if use_mla else DEVICE_REGULAR_ATTN_BACKENDS[device] + for name in backends: + block_sizes = DEVICE_MLA_BLOCK_SIZES[device] if use_mla else [ + 16 + ] + for block_size in block_sizes: + params.append( + pytest.param( + device, + name, + use_mla, + block_size, + id= + f"{device}_{name}_mla_{str(use_mla)[0]}_blks{block_size}" + )) + return params + + +@pytest.mark.parametrize("device, name, use_mla, block_size", + generate_params()) +@pytest.mark.parametrize("use_v1", [True, False]) +def test_env( + device: str, + name: str, + use_mla: bool, + block_size: int, + use_v1: bool, + monkeypatch: pytest.MonkeyPatch, +): + """Test attention backend selection with valid device-backend pairs.""" + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv(STR_BACKEND_ENV_VAR, name) + m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0") + + if device == "cpu": + with patch("vllm.attention.selector.current_platform", + CpuPlatform()): + backend = get_attn_backend(16, torch.float16, torch.float16, + block_size, False) + assert backend.get_name() == "TORCH_SDPA" + + elif device == "hip": + with patch("vllm.attention.selector.current_platform", + RocmPlatform()): + if use_mla: + # Validate HIP MLA backend-block_size combinations + valid_combination = ( + (name == "TRITON_MLA" and block_size != 1) + or (name == "ROCM_AITER_MLA" and block_size == 1)) + + if valid_combination: + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + assert backend.get_name() == name + else: + with pytest.raises(ValueError) as exc_info: + get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + assert f"The selected backend, {name}" in str( + exc_info.value) + else: + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" + assert backend.get_name() == expected + + 
elif device == "cuda": + with patch("vllm.attention.selector.current_platform", + CudaPlatform()): + if use_mla: + if name == "FLASHMLA" and block_size == 64: + from vllm.attention.backends.flashmla import ( + is_flashmla_supported) + + # only on cuda platforms with specific capability. + is_supported, _ = is_flashmla_supported() + + if not is_supported: + # if platform is not supported then skip this case. + pytest.skip() + else: + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = f"{name}_VLLM_V1" if use_v1 else name + assert backend.get_name() == expected + else: + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = ("TRITON_MLA_VLLM_V1" + if use_v1 else "TRITON_MLA") + assert backend.get_name() == expected + elif name == "FLASHINFER": + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = "FLASHINFER_VLLM_V1" if use_v1 else name + assert backend.get_name() == expected + else: + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name + assert backend.get_name() == expected + + +def test_flash_attn(monkeypatch: pytest.MonkeyPatch): + """Test FlashAttn validation.""" + # TODO: When testing for v1, pipe in `use_v1` as an argument to + # get_attn_backend + + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL) + + # Unsupported CUDA arch + monkeypatch.setattr(torch.cuda, "get_device_capability", lambda: + (7, 5)) + backend = get_attn_backend(16, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # Reset the monkeypatch for subsequent tests + monkeypatch.undo() + + # Unsupported data type + backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # Unsupported kv cache data type + backend = get_attn_backend(16, torch.float16, "fp8", 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # Unsupported block size + backend = get_attn_backend(16, torch.float16, None, 8, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # flash-attn is not installed + import sys + original_module = sys.modules.get('vllm_flash_attn') + monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None) + backend = get_attn_backend(16, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # Restore the original module if it existed + if original_module is not None: + monkeypatch.setitem(sys.modules, 'vllm_flash_attn', + original_module) + else: + monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False) + + # Unsupported head size + backend = get_attn_backend(17, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # Attention-free models should bypass env and use PlaceholderAttention + backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + +@pytest.mark.parametrize("use_v1", [True, False]) +def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch): + + with monkeypatch.context() as m, patch( + "vllm.attention.selector.current_platform", CudaPlatform()): + m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) + + # Test with head size 32 + backend = get_attn_backend(32, 
torch.float16, None, 16, False) + EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN" + assert backend.get_name() == EXPECTED + + # when block size == 16, backend will fall back to XFORMERS + # this behavior is not yet supported on V1. + if use_v1: + # TODO: support fallback on V1! + # https://github.com/vllm-project/vllm/issues/14524 + pass + else: + backend = get_attn_backend(16, torch.float16, None, 16, False) + assert backend.get_name() == "XFORMERS" diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/attention/test_blocksparse_attention.py similarity index 99% rename from tests/kernels/test_blocksparse_attention.py rename to tests/kernels/attention/test_blocksparse_attention.py index 3025ae0f921..82d03825757 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/attention/test_blocksparse_attention.py @@ -6,14 +6,13 @@ import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from vllm import _custom_ops as ops from vllm.attention.ops.blocksparse_attention.interface import ( LocalStridedBlockSparseAttn) from vllm.platforms import current_platform from vllm.utils import get_max_shared_memory_bytes -from .allclose_default import get_default_atol, get_default_rtol - FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. # - 512 as a buffer diff --git a/tests/kernels/test_cache.py b/tests/kernels/attention/test_cache.py similarity index 93% rename from tests/kernels/test_cache.py rename to tests/kernels/attention/test_cache.py index 899122818e0..2f2212dd2b0 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/attention/test_cache.py @@ -16,6 +16,7 @@ NUM_HEADS = [8] # Arbitrary values for testing HEAD_SIZES = [64, 80, 120, 256] BLOCK_SIZES = [8, 16, 32] +CACHE_LAYOUTS = ["NHD", "HND"] # Parameters for MLA tests. KV_LORA_RANKS = [512] @@ -220,6 +221,7 @@ def test_reshape_and_cache( @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) +@pytest.mark.parametrize("kv_cache_layout", CACHE_LAYOUTS) @torch.inference_mode() def test_reshape_and_cache_flash( kv_cache_factory_flashinfer, @@ -232,17 +234,21 @@ def test_reshape_and_cache_flash( seed: int, device: str, kv_cache_dtype: str, + kv_cache_layout: str, ) -> None: current_platform.seed_everything(seed) torch.set_default_device(device) + # fp8 conversion requires continugous memory buffer. Reduce the number of + # blocks and tokens to consume less memory. + num_tokens = num_tokens // 2 + num_blocks = num_blocks // 2 # Create a random slot mapping. 
num_slots = block_size * num_blocks slot_mapping_lst = random.sample(range(num_slots), num_tokens) slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device) - qkv = torch.randn(num_tokens, 3, num_heads, @@ -261,27 +267,35 @@ def test_reshape_and_cache_flash( kv_cache_dtype, dtype, device=device, + cache_layout=kv_cache_layout, ) - key_cache, value_cache = key_caches[0].contiguous( - ), value_caches[0].contiguous() + key_cache, value_cache = key_caches[0], value_caches[0] del key_caches del value_caches k_scale = (key.amax() / 64.0).to(torch.float32) v_scale = (value.amax() / 64.0).to(torch.float32) + def permute_and_compact(x): + y = x if kv_cache_layout == "NHD" else x.permute(0, 2, 1, 3) + return y.contiguous() + + key_cache_compact = permute_and_compact(key_cache) + value_cache_compact = permute_and_compact(value_cache) + # Clone the KV caches. if kv_cache_dtype == "fp8": - cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16) - ops.convert_fp8(cloned_key_cache, key_cache, k_scale.item(), - kv_cache_dtype) - cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16) - ops.convert_fp8(cloned_value_cache, value_cache, v_scale.item(), + cloned_key_cache = torch.empty_like(key_cache_compact, + dtype=torch.float16) + ops.convert_fp8(cloned_key_cache, key_cache_compact, k_scale.item(), kv_cache_dtype) + cloned_value_cache = torch.empty_like(value_cache_compact, + dtype=torch.float16) + ops.convert_fp8(cloned_value_cache, value_cache_compact, + v_scale.item(), kv_cache_dtype) else: - cloned_key_cache = key_cache.clone() - cloned_value_cache = value_cache.clone() - + cloned_key_cache = key_cache_compact.clone() + cloned_value_cache = value_cache_compact.clone() # Call the reshape_and_cache kernel. opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash, (key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, @@ -289,16 +303,20 @@ def test_reshape_and_cache_flash( cond=(head_size == HEAD_SIZES[0])) ops.reshape_and_cache_flash(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, k_scale, v_scale) + key_cache_compact = permute_and_compact(key_cache) + value_cache_compact = permute_and_compact(value_cache) if kv_cache_dtype == "fp8": - result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) + result_key_cache = torch.empty_like(key_cache_compact, + dtype=torch.float16) ops.convert_fp8(result_key_cache, - key_cache, + key_cache_compact, k_scale.item(), kv_dtype=kv_cache_dtype) - result_value_cache = torch.empty_like(value_cache, dtype=torch.float16) + result_value_cache = torch.empty_like(value_cache_compact, + dtype=torch.float16) ops.convert_fp8(result_value_cache, - value_cache, + value_cache_compact, v_scale.item(), kv_dtype=kv_cache_dtype) @@ -310,8 +328,12 @@ def test_reshape_and_cache_flash( for i in range(num_tokens): block_idx = block_indicies_lst[i] block_offset = block_offsets_lst[i] - cloned_key_cache[block_idx, block_offset, :, :] = key[i] - cloned_value_cache[block_idx, block_offset, :, :] = value[i] + if kv_cache_layout == "NHD": + cloned_key_cache[block_idx, block_offset, :, :] = key[i] + cloned_value_cache[block_idx, block_offset, :, :] = value[i] + else: + cloned_key_cache[block_idx, :, block_offset, :] = key[i] + cloned_value_cache[block_idx, :, block_offset, :] = value[i] if kv_cache_dtype == "fp8": torch.testing.assert_close(result_key_cache, @@ -323,8 +345,8 @@ def test_reshape_and_cache_flash( atol=0.001, rtol=0.1) else: - torch.testing.assert_close(key_cache, cloned_key_cache) - 
torch.testing.assert_close(value_cache, cloned_value_cache) + torch.testing.assert_close(key_cache_compact, cloned_key_cache) + torch.testing.assert_close(value_cache_compact, cloned_value_cache) @pytest.mark.parametrize("direction", COPYING_DIRECTION) diff --git a/tests/kernels/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py similarity index 100% rename from tests/kernels/test_cascade_flash_attn.py rename to tests/kernels/attention/test_cascade_flash_attn.py diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/attention/test_encoder_decoder_attn.py similarity index 100% rename from tests/kernels/test_encoder_decoder_attn.py rename to tests/kernels/attention/test_encoder_decoder_attn.py diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py similarity index 99% rename from tests/kernels/test_flash_attn.py rename to tests/kernels/attention/test_flash_attn.py index 572563c0bd8..88516b75cde 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/attention/test_flash_attn.py @@ -145,7 +145,7 @@ def test_flash_attn_with_paged_kv( v_descale = None if q_dtype is not None: # QKV are drawn from N(0, 1): no need for a fp8 scaling factor - maybe_quantized_query = query.to(q_dtype) + maybe_quantized_query = q.to(q_dtype) maybe_quantized_key_cache = key_cache.to(q_dtype) maybe_quantized_value_cache = value_cache.to(q_dtype) diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py similarity index 100% rename from tests/kernels/test_flashinfer.py rename to tests/kernels/attention/test_flashinfer.py diff --git a/tests/kernels/test_flashmla.py b/tests/kernels/attention/test_flashmla.py similarity index 100% rename from tests/kernels/test_flashmla.py rename to tests/kernels/attention/test_flashmla.py diff --git a/tests/kernels/test_lightning_attn.py b/tests/kernels/attention/test_lightning_attn.py similarity index 100% rename from tests/kernels/test_lightning_attn.py rename to tests/kernels/attention/test_lightning_attn.py diff --git a/tests/kernels/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py similarity index 100% rename from tests/kernels/test_merge_attn_states.py rename to tests/kernels/attention/test_merge_attn_states.py diff --git a/tests/kernels/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py similarity index 100% rename from tests/kernels/test_mha_attn.py rename to tests/kernels/attention/test_mha_attn.py diff --git a/tests/kernels/test_mla_decode_cpu.py b/tests/kernels/attention/test_mla_decode_cpu.py similarity index 100% rename from tests/kernels/test_mla_decode_cpu.py rename to tests/kernels/attention/test_mla_decode_cpu.py diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py similarity index 100% rename from tests/kernels/test_prefix_prefill.py rename to tests/kernels/attention/test_prefix_prefill.py diff --git a/tests/kernels/attention/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py new file mode 100644 index 00000000000..4cf7bcb01d4 --- /dev/null +++ b/tests/kernels/attention/test_rocm_attention_selector.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import torch + +from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend +from vllm.platforms.rocm import RocmPlatform +from vllm.utils import STR_BACKEND_ENV_VAR + + +@pytest.fixture(autouse=True) +def clear_cache(): + """Clear lru 
cache to ensure each test case runs without caching. + """ + _cached_get_attn_backend.cache_clear() + + +def test_selector(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH") + + # Set the current platform to ROCm using monkeypatch + monkeypatch.setattr("vllm.attention.selector.current_platform", + RocmPlatform()) + + # Test standard ROCm attention + backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) + assert (backend.get_name() == "ROCM_FLASH" + or backend.get_name() == "TRITON_ATTN_VLLM_V1") + + # MLA test for deepseek related + + # change the attention backend to triton MLA + m.setenv(STR_BACKEND_ENV_VAR, "TRITON_MLA") + backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, + False, True) + assert backend.get_name() == "TRITON_MLA" + + # If attention backend is None + # If use_mla is true + # The selected backend is triton MLA + m.setenv(STR_BACKEND_ENV_VAR, None) + backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, + False, True) + assert backend.get_name() == "TRITON_MLA" + + # change the attention backend to AITER MLA + m.setenv(STR_BACKEND_ENV_VAR, "ROCM_AITER_MLA") + backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, + False, True) + assert backend.get_name() == "ROCM_AITER_MLA" + + # If attention backend is None + # If use_mla is true + # If VLLM_ROCM_USE_AITER is enabled + # The selected backend is ROCM_AITER_MLA + m.setenv(STR_BACKEND_ENV_VAR, None) + m.setenv("VLLM_ROCM_USE_AITER", "1") + backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, + False, True) + assert backend.get_name() == "ROCM_AITER_MLA" diff --git a/tests/kernels/test_triton_decode_attention.py b/tests/kernels/attention/test_triton_decode_attention.py similarity index 100% rename from tests/kernels/test_triton_decode_attention.py rename to tests/kernels/attention/test_triton_decode_attention.py diff --git a/tests/kernels/test_activation.py b/tests/kernels/core/test_activation.py similarity index 97% rename from tests/kernels/test_activation.py rename to tests/kernels/core/test_activation.py index cf0f21ce065..79f838a954e 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/core/test_activation.py @@ -5,6 +5,7 @@ import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.utils import opcheck from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, GeluAndMul, MulAndSilu, @@ -12,8 +13,6 @@ SiluAndMul) from vllm.platforms import current_platform -from .allclose_default import get_default_atol, get_default_rtol - DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing D = [512, 13824] # Arbitrary values for testing diff --git a/tests/kernels/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py similarity index 100% rename from tests/kernels/test_fused_quant_layernorm.py rename to tests/kernels/core/test_fused_quant_layernorm.py diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/core/test_layernorm.py similarity index 100% rename from tests/kernels/test_layernorm.py rename to tests/kernels/core/test_layernorm.py diff --git a/tests/kernels/core/test_opcheck.py b/tests/kernels/core/test_opcheck.py new file mode 100644 index 00000000000..c9a9679c5d8 --- /dev/null +++ b/tests/kernels/core/test_opcheck.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Tests for miscellaneous 
utilities +""" + +import torch + +from tests.kernels.utils import opcheck + + +def test_convert_fp8_opcheck(): + data = torch.randn((256, 256), dtype=torch.float32, device="cuda") + result = torch.empty_like(data, dtype=torch.float8_e4m3fn) + opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8")) + + +# TODO: Add this back, currently fails with +# csrc/cuda_utils_kernels.cu:15 'invalid argument' +# @pytest.mark.skipif(not current_platform.is_cuda(), +# reason="Only supported for CUDA") +# def test_cuda_utils_opcheck(): +# opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0)) +# opcheck( +# torch.ops._C_cuda_utils. +# get_max_shared_memory_per_block_device_attribute, (0, )) diff --git a/tests/kernels/test_permute_cols.py b/tests/kernels/core/test_permute_cols.py similarity index 100% rename from tests/kernels/test_permute_cols.py rename to tests/kernels/core/test_permute_cols.py diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py similarity index 99% rename from tests/kernels/test_pos_encoding.py rename to tests/kernels/core/test_pos_encoding.py index eb83b4d612c..2b7bf755ec2 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -6,11 +6,10 @@ import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform -from .allclose_default import get_default_atol, get_default_rtol - IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] HEAD_SIZES = [64, 80, 112, 120, 256] diff --git a/tests/kernels/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py similarity index 100% rename from tests/kernels/test_rotary_embedding.py rename to tests/kernels/core/test_rotary_embedding.py diff --git a/tests/kernels/test_uva.py b/tests/kernels/core/test_uva.py similarity index 100% rename from tests/kernels/test_uva.py rename to tests/kernels/core/test_uva.py diff --git a/tests/kernels/test_causal_conv1d.py b/tests/kernels/mamba/test_causal_conv1d.py similarity index 100% rename from tests/kernels/test_causal_conv1d.py rename to tests/kernels/mamba/test_causal_conv1d.py diff --git a/tests/kernels/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py similarity index 100% rename from tests/kernels/test_mamba_mixer2.py rename to tests/kernels/mamba/test_mamba_mixer2.py diff --git a/tests/kernels/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py similarity index 100% rename from tests/kernels/test_mamba_ssm.py rename to tests/kernels/mamba/test_mamba_ssm.py diff --git a/tests/kernels/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py similarity index 100% rename from tests/kernels/test_mamba_ssm_ssd.py rename to tests/kernels/mamba/test_mamba_ssm_ssd.py diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py new file mode 100644 index 00000000000..975cd418a17 --- /dev/null +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -0,0 +1,364 @@ +# SPDX-License-Identifier: Apache-2.0 +import dataclasses +from typing import Optional + +import pytest +import torch + +from vllm import _custom_ops as ops +from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 +from vllm.model_executor.layers.fused_moe.fused_moe import (fused_experts, + fused_topk) +from vllm.platforms 
import current_platform + +NUM_EXPERTS = [40, 64] +TOP_KS = [6, 8] + +MNK_FACTORS = [ + (2, 1024, 1024), + (2, 1024, 1536), + (2, 3072, 1024), + (2, 3072, 1536), + (64, 1024, 1024), + (64, 1024, 1536), + (64, 3072, 1024), + (64, 3072, 1536), + (224, 1024, 1024), + (224, 1024, 1536), + (224, 3072, 1024), + (224, 3072, 1536), +] + + +@dataclasses.dataclass +class MOETensors: + a: torch.Tensor + w1: torch.Tensor + w2: torch.Tensor + ab_strides1: torch.Tensor + c_strides1: torch.Tensor + ab_strides2: torch.Tensor + c_strides2: torch.Tensor + + @staticmethod + def make_moe_tensors(m: int, k: int, n: int, e: int, + dtype: torch.dtype) -> "MOETensors": + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 + ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64) + c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64) + return MOETensors(a=a, + w1=w1, + w2=w2, + ab_strides1=ab_strides1, + c_strides1=c_strides1, + ab_strides2=ab_strides2, + c_strides2=c_strides2) + + +@dataclasses.dataclass +class MOETensors8Bit(MOETensors): + # quantized + a_q: Optional[torch.Tensor] = None # a -> a_q + w1_q: Optional[torch.Tensor] = None # w1 -> w1_q + w2_q: Optional[torch.Tensor] = None # w2 -> w2_q + a_scale: Optional[torch.Tensor] = None + w1_scale: Optional[torch.Tensor] = None + w2_scale: Optional[torch.Tensor] = None + # dequantized + a_d: Optional[torch.Tensor] = None # a -> a_q -> a_d + w1_d: Optional[torch.Tensor] = None # w1 -> w1_q -> w1_d + w2_d: Optional[torch.Tensor] = None # w2 -> w2_q -> w2_d + + @staticmethod + def make_moe_tensors_8bit(m: int, k: int, n: int, e: int, + per_act_token: bool, + per_out_channel: bool) -> "MOETensors8Bit": + dtype = torch.half + q_dtype = torch.float8_e4m3fn + + moe_tensors_fp16 = MOETensors.make_moe_tensors(m, k, n, e, dtype) + + # a -> a_q, w1 -> w1_q, w2 -> w2_q + n_b_scales = 2 * n if per_out_channel else 1 + k_b_scales = k if per_out_channel else 1 + # Get the right scale for tests. 
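+        # Note: the first scaled_fp8_quant call only derives a_scale
+        # dynamically; the second call re-quantizes with that fixed scale so
+        # a_q and a_scale stay consistent for the dequantized references below.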
+ _, a_scale = ops.scaled_fp8_quant( + moe_tensors_fp16.a, use_per_token_if_dynamic=per_act_token) + a_q, _ = ops.scaled_fp8_quant(moe_tensors_fp16.a, + a_scale, + use_per_token_if_dynamic=per_act_token) + w1_q = torch.empty((e, 2 * n, k), device="cuda", dtype=q_dtype) + w2_q = torch.empty((e, k, n), device="cuda", dtype=q_dtype) + + w1_scale = torch.empty((e, n_b_scales, 1), + device="cuda", + dtype=torch.float32) + w2_scale = torch.empty((e, k_b_scales, 1), + device="cuda", + dtype=torch.float32) + for expert in range(e): + w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant( + moe_tensors_fp16.w1[expert], + use_per_token_if_dynamic=per_out_channel) + w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant( + moe_tensors_fp16.w2[expert], + use_per_token_if_dynamic=per_out_channel) + + # a_q -> a_d, w1_q -> w1_d, w2_q -> w2_d + a_d = a_q.float().mul(a_scale).to(dtype) + w1_d = torch.empty_like(moe_tensors_fp16.w1) + w2_d = torch.empty_like(moe_tensors_fp16.w2) + for expert in range(e): + w1_d[expert] = (w1_q[expert].float() * w1_scale[expert]).half() + w2_d[expert] = (w2_q[expert].float() * w2_scale[expert]).half() + + return MOETensors8Bit(a=moe_tensors_fp16.a, + w1=moe_tensors_fp16.w1, + w2=moe_tensors_fp16.w2, + ab_strides1=moe_tensors_fp16.ab_strides1, + c_strides1=moe_tensors_fp16.c_strides1, + ab_strides2=moe_tensors_fp16.ab_strides2, + c_strides2=moe_tensors_fp16.c_strides2, + a_q=a_q, + w1_q=w1_q, + w2_q=w2_q, + a_scale=a_scale, + w1_scale=w1_scale, + w2_scale=w2_scale, + a_d=a_d, + w1_d=w1_d, + w2_d=w2_d) + + +def run_with_expert_maps(num_experts: int, num_local_experts: int, + **cutlass_moe_kwargs): + + def slice_experts(): + slice_params = [ + "w1_q", "w2_q", "ab_strides1", "ab_strides2", "c_strides1", + "c_strides2", "w1_scale", "w2_scale" + ] + full_tensors = { + k: v + for k, v in cutlass_moe_kwargs.items() + if k in slice_params and k in cutlass_moe_kwargs + } + + for i in range(0, num_experts, num_local_experts): + s, e = i, i + num_local_experts + + # make expert map + expert_map = [-1] * num_experts + expert_map[s:e] = list(range(num_local_experts)) + expert_map = torch.tensor(expert_map, + dtype=torch.int32, + device="cuda") + + # update cutlass moe arg with expert_map + cutlass_moe_kwargs["expert_map"] = expert_map + # update cutlass moe arg tensors + for k, t in full_tensors.items(): + cutlass_moe_kwargs[k] = t[s:e] + + yield cutlass_moe_kwargs + + out_tensor = torch.zeros_like(cutlass_moe_kwargs["a"]) + for kwargs in slice_experts(): + out_tensor = out_tensor + cutlass_moe_fp8(**kwargs) + + return out_tensor + + +def run_8_bit(moe_tensors: MOETensors8Bit, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_local_experts: Optional[int] = None) -> torch.Tensor: + assert not any([ + t is None for t in [ + moe_tensors.w1_q, moe_tensors.w2_q, moe_tensors.w1_scale, + moe_tensors.w2_scale, moe_tensors.a_scale + ] + ]) + + kwargs = { + 'a': moe_tensors.a, + 'w1_q': moe_tensors.w1_q.transpose(1, 2), # type: ignore[union-attr] + 'w2_q': moe_tensors.w2_q.transpose(1, 2), # type: ignore[union-attr] + 'topk_weights': topk_weights, + 'topk_ids_': topk_ids, + 'ab_strides1': moe_tensors.ab_strides1, + 'c_strides1': moe_tensors.c_strides1, + 'ab_strides2': moe_tensors.ab_strides2, + 'c_strides2': moe_tensors.c_strides2, + 'w1_scale': moe_tensors.w1_scale, + 'w2_scale': moe_tensors.w2_scale, + 'a1_scale': moe_tensors.a_scale + } + + num_experts = moe_tensors.w1.size(0) + with_ep = num_local_experts is not None or num_local_experts == num_experts + if not with_ep: + return 
cutlass_moe_fp8(**kwargs) + + assert num_local_experts is not None + return run_with_expert_maps( + num_experts, + num_local_experts, # type: ignore[arg-type] + **kwargs) + + +@pytest.mark.parametrize("m,n,k", MNK_FACTORS) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("per_act_token", [True, False]) +@pytest.mark.parametrize("per_out_ch", [True, False]) +@pytest.mark.skipif( + (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( + current_platform.get_device_capability()), + reason="Grouped gemm is not supported on this GPU type.") +def test_cutlass_moe_8_bit_no_graph( + m: int, + n: int, + k: int, + e: int, + topk: int, + per_act_token: bool, + per_out_ch: bool, +): + current_platform.seed_everything(7) + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + + mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, + per_out_ch) + + score = torch.randn((m, e), device="cuda", dtype=torch.half) + topk_weights, topk_ids = fused_topk(mt.a, + score, + topk, + renormalize=False) + + # Note that we are using the dequantized versions of the tensors. + # Using a, w1 and w2 directly results in minor output differences. + triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights, + topk_ids) + + cutlass_output = run_8_bit(mt, topk_weights, topk_ids) + + torch.testing.assert_close(triton_output, + cutlass_output, + atol=5e-2, + rtol=1e-2) + + +@pytest.mark.parametrize("m,n,k", MNK_FACTORS) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("per_act_token", [True, False]) +@pytest.mark.parametrize("per_out_ch", [True, False]) +@pytest.mark.skipif( + (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( + current_platform.get_device_capability()), + reason="Grouped gemm is not supported on this GPU type.") +def test_cutlass_moe_8_bit_cuda_graph( + m: int, + n: int, + k: int, + e: int, + topk: int, + per_act_token: bool, + per_out_ch: bool, +): + current_platform.seed_everything(7) + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + + dtype = torch.half + + mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, + per_out_ch) + + score = torch.randn((m, e), device="cuda", dtype=dtype) + topk_weights, topk_ids = fused_topk(mt.a, + score, + topk, + renormalize=False) + + # Note that we are using the dequantized versions of the tensors. + # Using a, w1 and w2 directly results in minor output differences. 
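+        # fused_experts on the dequantized tensors is the reference output;
+        # the cutlass path is captured and replayed inside a CUDA graph below.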
+ triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights, + topk_ids) + + stream = torch.cuda.Stream() + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, stream=stream): + cutlass_output = run_8_bit(mt, topk_weights, topk_ids) + + torch.cuda.synchronize() + graph.replay() + torch.cuda.synchronize() + + torch.testing.assert_close(triton_output, + cutlass_output, + atol=9e-2, + rtol=1e-2) + + +@pytest.mark.parametrize("m", [64]) +@pytest.mark.parametrize("n", [1024]) +@pytest.mark.parametrize("k", [4096]) +@pytest.mark.parametrize("e", [16]) +@pytest.mark.parametrize("topk", [1, 8]) +@pytest.mark.parametrize("per_act_token", [True]) +@pytest.mark.parametrize("per_out_channel", [True]) +@pytest.mark.parametrize("ep_size", [1, 2, 4, 8, 16]) +@pytest.mark.skipif( + (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( + current_platform.get_device_capability()), + reason="Grouped gemm is not supported on this GPU type.") +def test_cutlass_moe_8_bit_EP( + m: int, + n: int, + k: int, + e: int, + topk: int, + per_act_token: bool, + per_out_channel: bool, + ep_size: int, +): + current_platform.seed_everything(7) + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1))): + + mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, + per_out_channel) + + score = torch.randn((m, e), device="cuda", dtype=torch.half) + topk_weights, topk_ids = fused_topk(mt.a, + score, + topk, + renormalize=False) + + # Note that we are using the dequantized versions of the tensors. + # Using a, w1 and w2 directly results in minor output differences. + triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights, + topk_ids) + + assert e % ep_size == 0, "Cannot distribute experts evenly" + cutlass_output = run_8_bit(mt, + topk_weights, + topk_ids, + num_local_experts=e // ep_size) + + torch.testing.assert_close(triton_output, + cutlass_output, + atol=5e-2, + rtol=1e-2) diff --git a/tests/kernels/test_moe.py b/tests/kernels/moe/test_moe.py similarity index 99% rename from tests/kernels/test_moe.py rename to tests/kernels/moe/test_moe.py index 425f36984a3..f2cca65ae42 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -420,7 +420,8 @@ def test_fused_marlin_moe( score = torch.randn((m, e), device="cuda", dtype=dtype) - topk_weights, topk_ids = fused_topk(a, score, topk, False) + topk_weights, topk_ids, token_expert_indices = fused_topk( + a, score, topk, False) torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, e_map) diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py new file mode 100644 index 00000000000..dfcd61f7758 --- /dev/null +++ b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -0,0 +1,223 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Tests for the MOE permute/unpermute kernel + +Run `pytest tests/kernels/test_moe_permute_unpermute.py`. 
+""" + +from typing import Optional + +import numpy as np +import pytest +import torch + +from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk +from vllm.model_executor.layers.fused_moe.layer import determine_expert_map +from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( + moe_permute, moe_unpermute) +from vllm.platforms import current_platform + +NUM_EXPERTS = [16, 64] +TOP_KS = [2, 4, 6, 8] +EP_SIZE = [1, 4, 16] +current_platform.seed_everything(0) + + +def torch_permute(hidden_states: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indices: torch.Tensor, + topk: int, + n_expert: int, + n_local_expert: int, + start_expert: int, + expert_map: Optional[torch.Tensor] = None, + align_block_size: Optional[int] = None, + fill_invalid_expert: int = -1) -> list[torch.Tensor]: + n_token, n_hidden = hidden_states.shape[0], hidden_states.shape[1] + if expert_map is not None: + is_local_expert = (expert_map[topk_ids] != -1) + not_local_expert = (expert_map[topk_ids] == -1) + topk_ids = is_local_expert * ( + topk_ids - start_expert) + not_local_expert * (topk_ids + n_expert) + + sorted_topk_ids, sorted_indices = torch.sort(topk_ids.flatten(), + stable=True) + dst_row_id2src_row_id_map = token_expert_indices.flatten()[sorted_indices] + + expert_first_token_offset = torch.zeros(n_local_expert + 1, + dtype=torch.int64, + device="cuda") + idx = 0 + for i in range(0, n_local_expert): + cnt = 0 + while idx < sorted_topk_ids.numel() and sorted_topk_ids[idx] == i: + cnt += 1 + idx += 1 + expert_first_token_offset[i + 1] = expert_first_token_offset[i] + cnt + + _, src2dst_idx = torch.sort(dst_row_id2src_row_id_map) + valid_row_idx = [] + if align_block_size is None: + + permuted_hidden_states = hidden_states[dst_row_id2src_row_id_map % + n_token, ...] 
+ permuted_row_size = permuted_hidden_states.shape[0] + m_indices = torch.empty(permuted_row_size, + device="cuda", + dtype=torch.int32).fill_(fill_invalid_expert) + for i in range(1, n_local_expert + 1): + first_token_offset = expert_first_token_offset[i - 1] + last_token_offset = expert_first_token_offset[i] + m_indices[first_token_offset:last_token_offset] = i - 1 + src_row_id2dst_row_id_map = torch.arange( + 0, n_token * topk, device="cuda", + dtype=torch.int32)[src2dst_idx].reshape((n_token, topk)) + valid_row_idx += [i for i in range(expert_first_token_offset[-1])] + return [ + permuted_hidden_states, expert_first_token_offset, + src_row_id2dst_row_id_map, m_indices, valid_row_idx + ] + else: + permuted_row_size = (topk * n_token + n_expert * + (align_block_size - 1) + align_block_size - + 1) // align_block_size * align_block_size + permuted_hidden_states = torch.empty((permuted_row_size, n_hidden), + device="cuda", + dtype=hidden_states.dtype) + align_src_row_id2dst_row_id = torch.empty(n_token * topk, + device="cuda", + dtype=torch.int32) + align_expert_first_token_offset = torch.zeros_like( + expert_first_token_offset) + m_indices = torch.empty(permuted_row_size, + device="cuda", + dtype=torch.int32).fill_(fill_invalid_expert) + # get align_permuted_hidden_states, + # valid row_idx and align_expert_first_token_offset + for i in range(1, n_local_expert + 1): + first_token_offset = expert_first_token_offset[i - 1] + last_token_offset = expert_first_token_offset[i] + n_token_in_expert = last_token_offset - first_token_offset + align_expert_first_token_offset[ + i] = align_expert_first_token_offset[ + i - 1] + (n_token_in_expert + align_block_size - + 1) // align_block_size * align_block_size + align_first_token_offset = align_expert_first_token_offset[i - 1] + align_last_token_offset = align_expert_first_token_offset[i] + dst_row_id2src_row_id_in_expert = dst_row_id2src_row_id_map[ + first_token_offset:first_token_offset + + n_token_in_expert] % n_token + # store token in current expert with align_first_token_offset + permuted_hidden_states[align_first_token_offset:\ + align_first_token_offset+n_token_in_expert,\ + ...] = hidden_states[\ + dst_row_id2src_row_id_in_expert, ...] 
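            # Worked example (illustrative): with align_block_size=128, an
            # expert that receives 130 tokens occupies ceil(130/128)*128 = 256
            # rows of the permuted buffer. Only the first 130 rows are added to
            # valid_row_idx below; the trailing padding rows still get this
            # expert's id in m_indices so block-aligned grouped GEMM can run on
            # whole tiles, and their contents are simply never compared.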
+ # set current expert m_indices + m_indices[align_first_token_offset:align_last_token_offset] = i - 1 + valid_row_idx += [ + i for i in range(align_first_token_offset, + align_first_token_offset + n_token_in_expert) + ] + # get align_src_row_id2dst_row_id + for i in range(n_token * topk): + eid = sorted_topk_ids[i] + if (eid >= n_local_expert): + # check token not in local expert + align_src_row_id2dst_row_id[ + i] = align_expert_first_token_offset[-1] + continue + first_token_offset = expert_first_token_offset[eid] + align_first_token_offset = align_expert_first_token_offset[eid] + token_offset = i - first_token_offset + align_src_row_id2dst_row_id[ + i] = align_first_token_offset + token_offset + align_src_row_id2dst_row_id = align_src_row_id2dst_row_id[\ + src2dst_idx].reshape((n_token, topk)) + return [ + permuted_hidden_states, align_expert_first_token_offset, + align_src_row_id2dst_row_id, m_indices, valid_row_idx + ] + + +def torch_unpermute(permuted_hidden_states: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, + token_expert_indices: torch.Tensor, + src_row_id2dst_row_id_map: torch.Tensor, + valid_row_idx: torch.Tensor, topk: int, + n_expert: int) -> torch.Tensor: + # ignore invalid row + mask = torch.zeros(permuted_hidden_states.shape[0], + dtype=bool, + device="cuda") + mask[valid_row_idx] = True + permuted_hidden_states[~mask] = 0 + idx = src_row_id2dst_row_id_map.flatten()[ + token_expert_indices.flatten()].reshape(token_expert_indices.shape) + output = permuted_hidden_states[idx, ...] * topk_weights[..., None] + output = output.sum(dim=1).to(permuted_hidden_states.dtype) + return output + + +@pytest.mark.parametrize("n_token", [1, 33, 64, 222, 1024, 2048, 3000, 5000]) +@pytest.mark.parametrize("n_hidden", [2048, 4096, 7168]) +@pytest.mark.parametrize("n_expert", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("ep_size", EP_SIZE) +@pytest.mark.parametrize("align_block_size", [None, 128]) +def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int, + n_expert: int, ep_size: int, dtype: torch.dtype, + align_block_size: Optional[int]): + fill_invalid_expert = 0 + ep_rank = np.random.randint(0, ep_size) + expert_map = None + n_local_expert = n_expert + if (ep_size != 1): + n_local_expert, expert_map = determine_expert_map( + ep_size, ep_rank, n_expert) + expert_map = expert_map.cuda() + start_expert = n_local_expert * ep_rank + current_platform.seed_everything(0) + hidden_states = torch.randn((n_token, n_hidden), device="cuda").to(dtype) + gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype) + topk_weights, topk_ids, token_expert_indices = fused_topk( + hidden_states, gating_output, topk, False) + gold0, gold1, gold2, gold3, valid_row_idx = torch_permute( + hidden_states, + topk_ids, + token_expert_indices, + topk, + n_expert, + n_local_expert, + start_expert, + expert_map=expert_map, + align_block_size=align_block_size, + fill_invalid_expert=fill_invalid_expert) + + result0, result1, result2, result3 = moe_permute( + hidden_states, topk_weights, topk_ids, token_expert_indices, topk, + n_expert, n_local_expert, expert_map, align_block_size, + fill_invalid_expert) + + # check expert_first_token_offset + torch.testing.assert_close(gold1, result1, atol=0, rtol=0) + # check src_row_id2dst_row_id_map + torch.testing.assert_close(gold2, result2, atol=0, rtol=0) + # check mindice + torch.testing.assert_close(gold3, result3, atol=0, 
rtol=0) + # check permuted_hidden_states, only valid token + torch.testing.assert_close(gold0[valid_row_idx], + result0[valid_row_idx], + atol=0, + rtol=0) + + # add a random tensor to simulate group gemm + result0 = 0.5 * result0 + torch.randn_like(result0) + + result4 = moe_unpermute(result0, topk_weights, topk_ids, result2, result1, + topk, n_expert, n_local_expert) + gold4 = torch_unpermute(result0, topk_weights, topk_ids, + token_expert_indices, result2, valid_row_idx, topk, + n_local_expert) + + # check unpermuted hidden + torch.testing.assert_close(result4, gold4, atol=2e-2, rtol=0) diff --git a/tests/kernels/test_triton_moe_ptpc_fp8.py b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py similarity index 100% rename from tests/kernels/test_triton_moe_ptpc_fp8.py rename to tests/kernels/moe/test_triton_moe_ptpc_fp8.py diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 498da6001ae..764924f2678 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -87,3 +87,63 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \ ref_out = (as_float32_tensor(x) * ref_iscale).clamp( fp8_traits_min, fp8_traits_max).to(FP8_DTYPE) return ref_out, ref_scale.view((1, )) + + +def native_w8a8_block_matmul(A: torch.Tensor, B: torch.Tensor, + As: torch.Tensor, Bs: torch.Tensor, block_size, + output_dtype): + """This function performs matrix multiplication with block-wise + quantization using native torch. + It is agnostic to the input data type and can be used for both int8 and + fp8 data types. + + It takes two input tensors `A` and `B` (int8) with scales `As` and + `Bs` (float32). + The output is returned in the specified `output_dtype`. + """ + A = A.to(torch.float32) + B = B.to(torch.float32) + assert A.shape[-1] == B.shape[-1] + assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2 + assert len(block_size) == 2 + block_n, block_k = block_size[0], block_size[1] + assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1] + assert A.shape[:-1] == As.shape[:-1] + + M = A.numel() // A.shape[-1] + N, K = B.shape + origin_C_shape = A.shape[:-1] + (N, ) + A = A.reshape(M, A.shape[-1]) + As = As.reshape(M, As.shape[-1]) + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + assert n_tiles == Bs.shape[0] + assert k_tiles == Bs.shape[1] + + C_shape = (M, N) + C = torch.zeros(C_shape, dtype=torch.float32, device=A.device) + + A_tiles = [ + A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles) + ] + B_tiles = [[ + B[ + j * block_n:min((j + 1) * block_n, N), + i * block_k:min((i + 1) * block_k, K), + ] for i in range(k_tiles) + ] for j in range(n_tiles)] + C_tiles = [ + C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles) + ] + As_tiles = [As[:, i:i + 1] for i in range(k_tiles)] + + for i in range(k_tiles): + for j in range(n_tiles): + a = A_tiles[i] + b = B_tiles[j][i] + c = C_tiles[j] + s = As_tiles[i] * Bs[j][i] + c[:, :] += torch.matmul(a, b.t()) * s + + C = C.reshape(origin_C_shape).to(output_dtype) + return C diff --git a/tests/kernels/test_allspark_gemm.py b/tests/kernels/quantization/test_allspark_gemm.py similarity index 100% rename from tests/kernels/test_allspark_gemm.py rename to tests/kernels/quantization/test_allspark_gemm.py diff --git a/tests/kernels/test_aqlm.py b/tests/kernels/quantization/test_aqlm.py similarity index 100% rename from tests/kernels/test_aqlm.py rename to tests/kernels/quantization/test_aqlm.py diff --git a/tests/kernels/test_awq.py 
b/tests/kernels/quantization/test_awq.py similarity index 100% rename from tests/kernels/test_awq.py rename to tests/kernels/quantization/test_awq.py diff --git a/tests/kernels/test_awq_marlin.py b/tests/kernels/quantization/test_awq_marlin.py similarity index 98% rename from tests/kernels/test_awq_marlin.py rename to tests/kernels/quantization/test_awq_marlin.py index 939b0e7157b..c30fe60becd 100644 --- a/tests/kernels/test_awq_marlin.py +++ b/tests/kernels/quantization/test_awq_marlin.py @@ -84,7 +84,8 @@ def test_fused_marlin_moe_awq( score = torch.randn((m, e), device="cuda", dtype=dtype) - topk_weights, topk_ids = fused_topk(a, score, topk, False) + topk_weights, topk_ids, token_expert_indices = fused_topk( + a, score, topk, False) marlin_output = torch.ops.vllm.fused_marlin_moe( a, qweight1, diff --git a/tests/kernels/test_awq_triton.py b/tests/kernels/quantization/test_awq_triton.py similarity index 100% rename from tests/kernels/test_awq_triton.py rename to tests/kernels/quantization/test_awq_triton.py diff --git a/tests/kernels/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py similarity index 98% rename from tests/kernels/test_block_fp8.py rename to tests/kernels/quantization/test_block_fp8.py index c450048bf66..38c7e461bb9 100644 --- a/tests/kernels/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -6,6 +6,7 @@ import pytest import torch +from tests.kernels.quant_utils import native_w8a8_block_matmul from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe @@ -18,8 +19,6 @@ per_token_group_quant_fp8, w8a8_block_fp8_matmul) from vllm.platforms import current_platform -from .utils_block import native_w8a8_block_matmul - dg_available = False try: import deep_gemm @@ -339,7 +338,8 @@ def deep_gemm_w8a8_block_fp8_moe(M, K, a, w1, w2, w1_s, w2_s, score, topk, M, K = a.shape N = w2.shape[-1] - topk_weight, topk_ids = fused_topk(a, score.float(), topk, False) + topk_weight, topk_ids, token_expert_indices = fused_topk( + a, score.float(), topk, False) block_m = deep_gemm.get_m_alignment_for_contiguous_layout() @@ -436,7 +436,8 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed): ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_size) - topk_weights, topk_ids = fused_topk(a, score.float(), topk, False) + topk_weights, topk_ids, token_expert_indices = fused_topk( + a, score.float(), topk, False) out = deep_gemm_moe_fp8(a, w1, w2, w1_s, w2_s, topk_weights, topk_ids) diff --git a/tests/kernels/test_block_int8.py b/tests/kernels/quantization/test_block_int8.py similarity index 99% rename from tests/kernels/test_block_int8.py rename to tests/kernels/quantization/test_block_int8.py index 9447f9d6916..104f23fd7cd 100644 --- a/tests/kernels/test_block_int8.py +++ b/tests/kernels/quantization/test_block_int8.py @@ -6,6 +6,7 @@ import pytest import torch +from tests.kernels.quant_utils import native_w8a8_block_matmul from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe @@ -13,8 +14,6 @@ w8a8_block_int8_matmul) from vllm.platforms import current_platform -from .utils_block import native_w8a8_block_matmul - if current_platform.get_device_capability() < (7, 0): pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True) diff --git 
a/tests/kernels/test_cutlass_2of4_sparse.py b/tests/kernels/quantization/test_cutlass_2of4_sparse.py similarity index 99% rename from tests/kernels/test_cutlass_2of4_sparse.py rename to tests/kernels/quantization/test_cutlass_2of4_sparse.py index 2890e15d6cb..d67d2dbb899 100644 --- a/tests/kernels/test_cutlass_2of4_sparse.py +++ b/tests/kernels/quantization/test_cutlass_2of4_sparse.py @@ -7,13 +7,12 @@ import pytest import torch +from tests.kernels.utils import baseline_scaled_mm, to_fp8, to_int8 from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( sparse_cutlass_supported) from vllm.platforms import current_platform -from .utils import baseline_scaled_mm, to_fp8, to_int8 - CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py similarity index 99% rename from tests/kernels/test_cutlass.py rename to tests/kernels/quantization/test_cutlass_scaled_mm.py index f11ce6f45a9..8084d9bf2c2 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -8,13 +8,11 @@ import pytest import torch -from tests.kernels.utils import opcheck +from tests.kernels.utils import baseline_scaled_mm, opcheck, to_fp8, to_int8 from vllm import _custom_ops as ops from vllm.platforms import current_platform from vllm.utils import cdiv -from .utils import baseline_scaled_mm, to_fp8, to_int8 - MNK_FACTORS = [ (1, 256, 128), (1, 16384, 1024), diff --git a/tests/kernels/test_fp8_quant.py b/tests/kernels/quantization/test_fp8_quant.py similarity index 100% rename from tests/kernels/test_fp8_quant.py rename to tests/kernels/quantization/test_fp8_quant.py diff --git a/tests/kernels/test_ggml.py b/tests/kernels/quantization/test_ggml.py similarity index 100% rename from tests/kernels/test_ggml.py rename to tests/kernels/quantization/test_ggml.py diff --git a/tests/kernels/test_gguf.py b/tests/kernels/quantization/test_gguf.py similarity index 100% rename from tests/kernels/test_gguf.py rename to tests/kernels/quantization/test_gguf.py diff --git a/tests/kernels/test_gptq.py b/tests/kernels/quantization/test_gptq.py similarity index 100% rename from tests/kernels/test_gptq.py rename to tests/kernels/quantization/test_gptq.py diff --git a/tests/kernels/test_int8_kernel.py b/tests/kernels/quantization/test_int8_kernel.py similarity index 100% rename from tests/kernels/test_int8_kernel.py rename to tests/kernels/quantization/test_int8_kernel.py diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/quantization/test_int8_quant.py similarity index 100% rename from tests/kernels/test_int8_quant.py rename to tests/kernels/quantization/test_int8_quant.py diff --git a/tests/kernels/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py similarity index 100% rename from tests/kernels/test_machete_mm.py rename to tests/kernels/quantization/test_machete_mm.py diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py similarity index 100% rename from tests/kernels/test_marlin_gemm.py rename to tests/kernels/quantization/test_marlin_gemm.py diff --git a/tests/kernels/test_nvfp4_quant.py b/tests/kernels/quantization/test_nvfp4_quant.py similarity index 100% rename from tests/kernels/test_nvfp4_quant.py rename to tests/kernels/quantization/test_nvfp4_quant.py diff --git a/tests/kernels/test_nvfp4_scaled_mm.py 
b/tests/kernels/quantization/test_nvfp4_scaled_mm.py similarity index 100% rename from tests/kernels/test_nvfp4_scaled_mm.py rename to tests/kernels/quantization/test_nvfp4_scaled_mm.py diff --git a/tests/kernels/quantization/test_rocm_skinny_gemms.py b/tests/kernels/quantization/test_rocm_skinny_gemms.py new file mode 100644 index 00000000000..622079c3944 --- /dev/null +++ b/tests/kernels/quantization/test_rocm_skinny_gemms.py @@ -0,0 +1,80 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch + +import vllm._custom_ops as ops +from tests.kernels.quant_utils import ref_dynamic_per_tensor_fp8_quant +from vllm.platforms import current_platform + +DTYPES = [torch.bfloat16, torch.float16] +M = [16, 32, 64, 128, 256, 512, 1024, 4096, 8192] +K = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192] # k % 8 == 0 +N = [1, 2, 3, 4] +SEEDS = [0] + + +@pytest.mark.parametrize("n", [1]) # only test for batch size 1 +@pytest.mark.parametrize("k", K) +@pytest.mark.parametrize("m", M) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("rows_per_block", [2, 4, 8, 16]) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.skipif(not current_platform.is_rocm(), + reason="only test for rocm") +@torch.inference_mode() +def test_rocm_llmm1_kernel(n, k, m, dtype, rows_per_block, seed): + torch.manual_seed(seed) + A = torch.rand(n, k, dtype=dtype, device="cuda") + B = torch.rand(m, k, dtype=dtype, device="cuda") + + ref_out = torch.matmul(A, B.t()) + out = ops.LLMM1(B, A, rows_per_block) + + assert torch.allclose(out, ref_out, rtol=0.01) + + +@pytest.mark.parametrize("n", N) # only test for batch size <= 4 +@pytest.mark.parametrize("k", K + [9216, 10240, 16384]) +@pytest.mark.parametrize("m", [8] + M) # m >= 8 +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.skipif(not current_platform.is_rocm(), + reason="only test for rocm") +def test_rocm_wvsplitk_kernel(n, k, m, dtype, seed): + torch.manual_seed(seed) + cu_count = current_platform.get_cu_count() + + A = torch.rand(n, k, dtype=dtype, device="cuda") + B = torch.rand(m, k, dtype=dtype, device="cuda") + + ref_out = torch.matmul(A, B.t()) + out = ops.wvSplitK(B, A, cu_count) + + assert torch.allclose(out, ref_out, rtol=0.01) + + +@pytest.mark.parametrize("n", N) # only test for batch size <= 4 +@pytest.mark.parametrize("k", K[1:] + [14336, 24576, 32768]) # k % 16 == 0 +@pytest.mark.parametrize("m", M + [28672]) # m >= 16 +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.skipif(not current_platform.is_rocm(), + reason="only test for rocm") +def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed): + torch.manual_seed(seed) + + A = torch.rand(n, k, device="cuda") + B = torch.rand(m, k, device="cuda") + + A, scale_a = ref_dynamic_per_tensor_fp8_quant(A) + B, scale_b = ref_dynamic_per_tensor_fp8_quant(B) + + ref_out = torch._scaled_mm(A, + B.t(), + out_dtype=dtype, + scale_a=scale_a, + scale_b=scale_b) + out = ops.wvSplitKQ(B, A, dtype, scale_a, scale_b, + current_platform.get_cu_count()) + + assert torch.allclose(out, ref_out, rtol=0.01) diff --git a/tests/kernels/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py similarity index 100% rename from tests/kernels/test_triton_scaled_mm.py rename to tests/kernels/quantization/test_triton_scaled_mm.py diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py deleted file mode 100644 index a51e70d45ee..00000000000 --- 
a/tests/kernels/test_attention_selector.py +++ /dev/null @@ -1,136 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from unittest.mock import patch - -import pytest -import torch - -from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend -from vllm.platforms.cpu import CpuPlatform -from vllm.platforms.cuda import CudaPlatform -from vllm.platforms.rocm import RocmPlatform -from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL - - -@pytest.fixture(autouse=True) -def clear_cache(): - """Clear lru cache to ensure each test case runs without caching. - """ - _cached_get_attn_backend.cache_clear() - - -@pytest.mark.parametrize( - "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"]) -@pytest.mark.parametrize("use_v1", [True, False]) -@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"]) -def test_env( - name: str, - use_v1: bool, - device: str, - monkeypatch: pytest.MonkeyPatch, -): - """Test that the attention selector can be set via environment variable. - Note that we do not test FlashAttn because it is the default backend. - """ - - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") - m.setenv(STR_BACKEND_ENV_VAR, name) - - if device == "cpu": - with patch("vllm.attention.selector.current_platform", - CpuPlatform()): - backend = get_attn_backend(16, torch.float16, torch.float16, - 16, False) - assert backend.get_name() == "TORCH_SDPA" - elif device == "hip": - with patch("vllm.attention.selector.current_platform", - RocmPlatform()): - backend = get_attn_backend(16, torch.float16, torch.float16, - 16, False) - EXPECTED = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" - assert backend.get_name() == EXPECTED - else: - if name in ["XFORMERS", "FLASHINFER"]: - with patch("vllm.attention.selector.current_platform", - CudaPlatform()): - backend = get_attn_backend(16, torch.float16, - torch.float16, 16, False) - EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name - assert backend.get_name() == EXPECTED - - -def test_flash_attn(monkeypatch: pytest.MonkeyPatch): - """Test FlashAttn validation.""" - # TODO: When testing for v1, pipe in `use_v1` as an argument to - # get_attn_backend - - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL) - - # Unsupported CUDA arch - monkeypatch.setattr(torch.cuda, "get_device_capability", lambda: - (7, 5)) - backend = get_attn_backend(16, torch.float16, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - # Reset the monkeypatch for subsequent tests - monkeypatch.undo() - - # Unsupported data type - backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - # Unsupported kv cache data type - backend = get_attn_backend(16, torch.float16, "fp8", 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - # Unsupported block size - backend = get_attn_backend(16, torch.float16, None, 8, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - # flash-attn is not installed - import sys - original_module = sys.modules.get('vllm_flash_attn') - monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None) - backend = get_attn_backend(16, torch.float16, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - # Restore the original module if it existed - if original_module is not None: - monkeypatch.setitem(sys.modules, 'vllm_flash_attn', - original_module) - else: - monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False) - - # 
Unsupported head size - backend = get_attn_backend(17, torch.float16, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - # Attention-free models should bypass env and use PlaceholderAttention - backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) - assert backend.get_name() != STR_FLASH_ATTN_VAL - - -@pytest.mark.parametrize("use_v1", [True, False]) -def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch): - - with monkeypatch.context() as m, patch( - "vllm.attention.selector.current_platform", CudaPlatform()): - m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") - m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) - - # Test with head size 32 - backend = get_attn_backend(32, torch.float16, None, 16, False) - EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN" - assert backend.get_name() == EXPECTED - - # when block size == 16, backend will fall back to XFORMERS - # this behavior is not yet supported on V1. - if use_v1: - # TODO: support fallback on V1! - # https://github.com/vllm-project/vllm/issues/14524 - pass - else: - backend = get_attn_backend(16, torch.float16, None, 16, False) - assert backend.get_name() == "XFORMERS" diff --git a/tests/kernels/test_cutlass_mla_decode.py b/tests/kernels/test_cutlass_mla_decode.py new file mode 100644 index 00000000000..87e4bd4b096 --- /dev/null +++ b/tests/kernels/test_cutlass_mla_decode.py @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch +import torch.nn.functional as F +from torch import Tensor + +import vllm._custom_ops as ops +from vllm.platforms import current_platform + +if not current_platform.has_device_capability(100): + pytest.skip( + reason="Cutlass MLA Requires compute capability of 10 or above.", + allow_module_level=True) + + +def ref_mla( + out: Tensor, # (bs, num_heads, v_head_dim) + query: Tensor, # (bs, num_heads, head_dim) + kv_cache: Tensor, # (num_blocks, block_size, head_dim) + scale: float, + block_tables: Tensor, # (bs, max_num_blocks) + seq_lens: Tensor, # (bs,) +): + bs, num_heads, v_head_dim = out.shape + head_dim = query.shape[2] + + for i in range(bs): + # gather and flatten KV-cache + kv = kv_cache[ + block_tables[i]] # (max_num_blocks, block_size, head_dim) + kv = kv.view(1, -1, + head_dim)[:, :seq_lens[i]] # (1, seq_len, head_dim) + v = kv[:, :, :v_head_dim] + + q = query[i].view(num_heads, 1, head_dim) + o = F.scaled_dot_product_attention(q, + kv, + v, + scale=scale, + enable_gqa=True) + out[i] = o.view(num_heads, v_head_dim) + + return out + + +@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) +@pytest.mark.parametrize("mean_seq_len", [128, 1024, 4096]) +@pytest.mark.parametrize("bs", [1, 2, 4]) +@pytest.mark.parametrize("varlen", [False, True]) +@pytest.mark.parametrize("block_size", [16, 64, 128]) +def test_cutlass_mla_decode(dtype: torch.dtype, mean_seq_len: int, bs: int, + varlen: bool, block_size: int): + torch.set_default_dtype(dtype) + torch.set_default_device('cuda') + torch.manual_seed(42) + + d = 576 + h_q = 128 + dv = 512 + + q_nope_dim = 128 + q_pe_dim = 64 + scale = (q_nope_dim + q_pe_dim)**(-0.5) + if varlen: + seq_lens = torch.empty(bs).normal_(mean_seq_len, mean_seq_len / 2) + seq_lens = seq_lens.clip(2).to(torch.int32) + else: + seq_lens = torch.full((bs, ), mean_seq_len, dtype=torch.int32) + max_seq_len = seq_lens.max().item() + block_num = (max_seq_len + block_size - 1) // block_size + + # Pad block_num so that small blocks can be packed into full 128-sized + # CUTLASS tiles. 
One 128-wide tile can hold (128 // block_size) small + # blocks. + pack_factor = 128 // block_size + block_num = ((block_num + pack_factor - 1) // pack_factor) * pack_factor + + q = torch.randn(bs, h_q, d) + block_table = torch.randint(0, + bs * block_num, (bs, block_num), + dtype=torch.int32) + + kv_cache = torch.randn(block_table.numel(), block_size, d) + + out_ref = q.new_zeros(bs, h_q, dv) + ref_mla(out_ref, q, kv_cache, scale, block_table, seq_lens) + out_ans = torch.zeros_like(out_ref) + q_nope = q[:, :, :dv].clone() + q_pe = q[:, :, dv:].clone() + ops.cutlass_mla_decode(out_ans, q_nope, q_pe, kv_cache, seq_lens, + block_table, scale) + + torch.testing.assert_close(out_ans, out_ref, atol=1e-2, rtol=1e-2) diff --git a/tests/kernels/test_cutlass_moe.py b/tests/kernels/test_cutlass_moe.py deleted file mode 100644 index 3cfed6ae853..00000000000 --- a/tests/kernels/test_cutlass_moe.py +++ /dev/null @@ -1,244 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -import pytest -import torch - -from vllm import _custom_ops as ops -from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config -from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 -from vllm.model_executor.layers.fused_moe.fused_moe import (fused_experts, - fused_topk) -from vllm.platforms import current_platform - -NUM_EXPERTS = [40, 64] -TOP_KS = [6, 8] - - -def run(a: torch.Tensor, a_scale: torch.Tensor, w1_q: torch.Tensor, - w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - ab_strides1: torch.Tensor, c_strides1: torch.Tensor, - ab_strides2: torch.Tensor, c_strides2: torch.Tensor): - with set_current_vllm_config( - VllmConfig(parallel_config=ParallelConfig( - pipeline_parallel_size=1))): - return cutlass_moe_fp8(a, - w1_q, - w2_q, - w1_scale, - w2_scale, - topk_weights, - topk_ids, - ab_strides1, - c_strides1, - ab_strides2, - c_strides2, - a1_scale=a_scale) - - -@pytest.mark.parametrize("m", [2, 64, 224]) -@pytest.mark.parametrize("n", [1024, 3072]) -@pytest.mark.parametrize("k", [1024, 1536]) -@pytest.mark.parametrize("e", NUM_EXPERTS) -@pytest.mark.parametrize("topk", TOP_KS) -@pytest.mark.parametrize("per_act_token", [True, False]) -@pytest.mark.parametrize("per_out_ch", [True, False]) -@pytest.mark.skipif( - (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( - current_platform.get_device_capability()), - reason="Grouped gemm is not supported on this GPU type.") -def test_cutlass_moe_no_graph( - m: int, - n: int, - k: int, - e: int, - topk: int, - per_act_token: bool, - per_out_ch: bool, -): - current_platform.seed_everything(7) - with set_current_vllm_config( - VllmConfig(parallel_config=ParallelConfig( - pipeline_parallel_size=1))): - - dtype = torch.half - - a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 - w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 - w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 - - # Get the right scale for tests. 
- _, a_scale1 = ops.scaled_fp8_quant( - a, use_per_token_if_dynamic=per_act_token) - a_q, _ = ops.scaled_fp8_quant(a, - a_scale1, - use_per_token_if_dynamic=per_act_token) - - a_d = a_q.float().mul(a_scale1).to(dtype) - - n_b_scales = 2 * n if per_out_ch else 1 - k_b_scales = k if per_out_ch else 1 - - w1_q = torch.empty((e, 2 * n, k), - device="cuda", - dtype=torch.float8_e4m3fn) - w2_q = torch.empty((e, k, n), device="cuda", dtype=torch.float8_e4m3fn) - w1_scale = torch.empty((e, n_b_scales, 1), - device="cuda", - dtype=torch.float32) - w2_scale = torch.empty((e, k_b_scales, 1), - device="cuda", - dtype=torch.float32) - - ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64) - c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64) - ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64) - c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64) - - for expert in range(e): - w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant( - w1[expert], use_per_token_if_dynamic=per_out_ch) - w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant( - w2[expert], use_per_token_if_dynamic=per_out_ch) - w1_q = w1_q.transpose(1, 2) - w2_q = w2_q.transpose(1, 2) - - ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64) - c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64) - ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64) - c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64) - - w1_d = torch.empty_like(w1) - w2_d = torch.empty_like(w2) - for expert in range(e): - w1_d[expert] = (w1_q[expert].t().float() * w1_scale[expert]).half() - w2_d[expert] = (w2_q[expert].t().float() * w2_scale[expert]).half() - - score = torch.randn((m, e), device="cuda", dtype=dtype) - topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False) - - triton_output = fused_experts(a_d, w1_d, w2_d, topk_weights, topk_ids) - - cutlass_output = cutlass_moe_fp8(a, - w1_q, - w2_q, - w1_scale, - w2_scale, - topk_weights, - topk_ids, - ab_strides1, - c_strides1, - ab_strides2, - c_strides2, - a1_scale=a_scale1) - - #print(triton_output) - #print(cutlass_output) - #print("*") - - torch.testing.assert_close(triton_output, - cutlass_output, - atol=5e-2, - rtol=1e-2) - - -@pytest.mark.parametrize("m", [2, 64, 224]) -@pytest.mark.parametrize("n", [1024, 3072]) -@pytest.mark.parametrize("k", [1024, 1536]) -@pytest.mark.parametrize("e", NUM_EXPERTS) -@pytest.mark.parametrize("topk", TOP_KS) -@pytest.mark.parametrize("per_act_token", [True, False]) -@pytest.mark.parametrize("per_out_ch", [True, False]) -@pytest.mark.skipif( - (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))( - current_platform.get_device_capability()), - reason="Grouped gemm is not supported on this GPU type.") -def test_cutlass_moe_cuda_graph( - m: int, - n: int, - k: int, - e: int, - topk: int, - per_act_token: bool, - per_out_ch: bool, -): - current_platform.seed_everything(7) - with set_current_vllm_config( - VllmConfig(parallel_config=ParallelConfig( - pipeline_parallel_size=1))): - - dtype = torch.half - - a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 - w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 - w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 - - # Get the right scale for tests. 
- _, a_scale1 = ops.scaled_fp8_quant( - a, use_per_token_if_dynamic=per_act_token) - a_q, _ = ops.scaled_fp8_quant(a, - a_scale1, - use_per_token_if_dynamic=per_act_token) - - a_d = a_q.float().mul(a_scale1).to(dtype) - - n_b_scales = 2 * n if per_out_ch else 1 - k_b_scales = k if per_out_ch else 1 - - w1_q = torch.empty((e, 2 * n, k), - device="cuda", - dtype=torch.float8_e4m3fn) - w2_q = torch.empty((e, k, n), device="cuda", dtype=torch.float8_e4m3fn) - w1_scale = torch.empty((e, n_b_scales, 1), - device="cuda", - dtype=torch.float32) - w2_scale = torch.empty((e, k_b_scales, 1), - device="cuda", - dtype=torch.float32) - - ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64) - c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64) - ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64) - c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64) - - for expert in range(e): - w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant( - w1[expert], use_per_token_if_dynamic=per_out_ch) - w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant( - w2[expert], use_per_token_if_dynamic=per_out_ch) - w1_q = w1_q.transpose(1, 2) - w2_q = w2_q.transpose(1, 2) - - ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64) - c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64) - ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64) - c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64) - - w1_d = torch.empty_like(w1) - w2_d = torch.empty_like(w2) - for expert in range(e): - w1_d[expert] = (w1_q[expert].t().float() * w1_scale[expert]).half() - w2_d[expert] = (w2_q[expert].t().float() * w2_scale[expert]).half() - - score = torch.randn((m, e), device="cuda", dtype=dtype) - topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False) - - triton_output = fused_experts(a_d, w1_d, w2_d, topk_weights, topk_ids) - - stream = torch.cuda.Stream() - graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(graph, stream=stream): - cutlass_output = run(a, a_scale1, w1_q, w2_q, w1_scale, w2_scale, - topk_weights, topk_ids, ab_strides1, - c_strides1, ab_strides2, c_strides2) - torch.cuda.synchronize() - graph.replay() - torch.cuda.synchronize() - - #print(triton_output) - #print(cutlass_output) - #print("*") - - torch.testing.assert_close(triton_output, - cutlass_output, - atol=9e-2, - rtol=1e-2) diff --git a/tests/kernels/test_fused_quant_activation.py b/tests/kernels/test_fused_quant_activation.py new file mode 100644 index 00000000000..fa84ad74cd8 --- /dev/null +++ b/tests/kernels/test_fused_quant_activation.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch + +import vllm._custom_ops as ops +from tests.kernels.utils import opcheck +from vllm.model_executor.layers.activation import SiluAndMul + +DTYPES = [torch.bfloat16, torch.float16] +QUANT_DTYPES = [torch.float8_e4m3fn] +NUM_TOKENS = [1, 17, 86, 1234, 3045] # Arbitrary values for testing +HIDDEN_SIZES = [16, 48, 128, 1562, 4096] # Arbitrary values for testing +SEEDS = [0] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] + + +def ref_impl(silu_and_mul: SiluAndMul, x: torch.Tensor, + scale: torch.Tensor) -> torch.Tensor: + silu_and_mul_out = silu_and_mul.forward_native(x) + out, scales = ops.scaled_fp8_quant(silu_and_mul_out, scale) + return out + + +def ops_impl(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor: + out_shape = (x.shape[0], x.shape[1] // 2) + out 
= torch.empty(out_shape, + dtype=torch.torch.float8_e4m3fn, + device=x.device) + torch.ops._C.silu_and_mul_quant(out, x, scale) + return out + + +@pytest.mark.parametrize("num_tokens", NUM_TOKENS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("quant_dtype", QUANT_DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_silu_and_mul( + num_tokens: int, + hidden_size: int, + dtype: torch.dtype, + quant_dtype: torch.dtype, + seed: int, + device: str, +) -> None: + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + + layer = SiluAndMul() + + # Make inputs + scale = (torch.randn((1), device=device, dtype=torch.float32)) + x = torch.randn(num_tokens, hidden_size, dtype=dtype) + + ref_out = ref_impl(layer, x, scale) + ops_out = ops_impl(x, scale) + + assert ref_out.dtype == quant_dtype + assert ops_out.dtype == quant_dtype + assert ref_out.shape == ops_out.shape + assert torch.allclose(ref_out.to(dtype=torch.float32), + ops_out.to(dtype=torch.float32)) + opcheck(torch.ops._C.silu_and_mul_quant, (ops_out, x, scale)) diff --git a/tests/kernels/test_rocm_attention_selector.py b/tests/kernels/test_rocm_attention_selector.py deleted file mode 100644 index 90b483b4a41..00000000000 --- a/tests/kernels/test_rocm_attention_selector.py +++ /dev/null @@ -1,34 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import pytest -import torch - -from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend -from vllm.platforms.rocm import RocmPlatform -from vllm.utils import STR_BACKEND_ENV_VAR - - -@pytest.fixture(autouse=True) -def clear_cache(): - """Clear lru cache to ensure each test case runs without caching. - """ - _cached_get_attn_backend.cache_clear() - - -def test_selector(monkeypatch: pytest.MonkeyPatch): - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH") - - # Set the current platform to ROCm using monkeypatch - monkeypatch.setattr("vllm.attention.selector.current_platform", - RocmPlatform()) - - # Test standard ROCm attention - backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) - assert (backend.get_name() == "ROCM_FLASH" - or backend.get_name() == "TRITON_ATTN_VLLM_V1") - - # mla test for deepseek related - backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, - False, True) - assert backend.get_name() == "TRITON_MLA" diff --git a/tests/kernels/test_triton_flash_attention.py b/tests/kernels/test_triton_flash_attention.py new file mode 100644 index 00000000000..cf2bdc908e4 --- /dev/null +++ b/tests/kernels/test_triton_flash_attention.py @@ -0,0 +1,499 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Tests for the triton_flash_attention kernel + +Run `pytest tests/kernels/test_triton_flash_attention.py`. 
+""" +import pytest +import torch + +from vllm.attention.ops.triton_flash_attention import (SUPPORTED_LAYOUTS, + MetaData, + compute_alibi_tensor, + scale_fp8, + triton_attention_rocm) +from vllm.platforms import current_platform + + +class ReferenceAttention: + + def __init__(self, Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, use_alibi, dtype, + input_metadata): + self.Z = Z + self.HQ = HQ + self.HK = HK + self.N_CTX_Q = N_CTX_Q + self.N_CTX_K = N_CTX_K + self.D_HEAD = D_HEAD + self.use_alibi = use_alibi + self.dtype = dtype + self.input_metadata = input_metadata + + def fwd(self, q, k, v): + scores = torch.einsum('bhqd,bhkd->bhqk', q, + k).float() * self.input_metadata.sm_scale + if self.input_metadata.causal: + mask = torch.tril(torch.ones(self.N_CTX_Q, + self.N_CTX_K, + device="cuda"), + diagonal=self.N_CTX_K - self.N_CTX_Q) + scores[:, :, mask == 0] = float("-inf") + + if self.input_metadata.bias is not None: + scores += self.input_metadata.bias + + if self.use_alibi: + scores += compute_alibi_tensor(self.input_metadata.alibi_slopes, + self.N_CTX_Q, self.N_CTX_K) + + p = torch.softmax(scores, dim=-1) + if self.input_metadata.causal: + # If N_CTX_Q > N_CTX_K, there's at least one row of all -infs going + # into softmax. This creates a row of NaNs as -inf - -inf == NaN. + # So we fix this by converting the NaNs to 0s, which is what they + # should be out of the softmax. + nan_mask = torch.isnan(p) + p[nan_mask == 1] = 0 + ref_out = torch.einsum('bhqk,bhkd->bhqd', p.to(self.dtype), v) + # compare + if self.input_metadata.layout == 'bshd': + ref_out = ref_out.transpose(1, 2).clone() + return ref_out + + def fwd_fp8(self, q_quantized, k_quantized, v_quantized): + q = (q_quantized.to(torch.float16) * self.input_metadata.q_descale).to( + self.dtype) + k = (k_quantized.to(torch.float16) * self.input_metadata.k_descale).to( + self.dtype) + v = (v_quantized.to(torch.float16) * self.input_metadata.v_descale).to( + self.dtype) + result = self.fwd(q, k, v) + if self.input_metadata.o_scale is not None: + result, _ = scale_fp8(result, self.input_metadata.o_scale) + return result + + def fwd_fp8_kv(self, q, k_quantized, v_quantized): + k_descale, v_descale = (self.input_metadata.k_descale, + self.input_metadata.v_descale) + k_dequantized = (k_quantized.to(torch.float32) * + k_descale.to(torch.float32)).to(self.dtype) + v_dequantized = (v_quantized.to(torch.float32) * + v_descale.to(torch.float32)).to(self.dtype) + return self.fwd(q, k_dequantized, v_dequantized) + + def varlen_fwd(self, q, k, v, is_mqa=False): + ref_out = torch.empty_like(q) + if is_mqa: + # Make KV look like HQ/HK "groups" of HK. Later, we will reshape so + # the size aligns with Q. 
+ k_ref = k.view(k.shape[0], k.shape[1], 1, + k.shape[2]).expand(-1, -1, self.HQ // self.HK, -1) + v_ref = v.view(v.shape[0], v.shape[1], 1, + v.shape[2]).expand(-1, -1, self.HQ // self.HK, -1) + else: + k_ref = k + v_ref = v + + for i in range(0, self.input_metadata.num_contexts): + start_q, start_k = self.input_metadata.cu_seqlens_q[ + i], self.input_metadata.cu_seqlens_k[i] + end_q, end_k = self.input_metadata.cu_seqlens_q[ + i + 1], self.input_metadata.cu_seqlens_k[i + 1] + k_curr = k_ref[start_k:end_k] + v_curr = v_ref[start_k:end_k] + if is_mqa: + k_curr = k_curr.reshape(k_curr.shape[0], -1, k_curr.shape[3]) + v_curr = v_curr.reshape(v_curr.shape[0], -1, v_curr.shape[3]) + scores = torch.einsum('qhd,khd->qhk', q[start_q:end_q], + k_curr).float() + p = torch.softmax(scores * self.input_metadata.sm_scale, + dim=-1).half() + ref_out[start_q:end_q] = torch.einsum('qhk,khd->qhd', p, v_curr) + return ref_out + + +def quantize_input(q, k, v, fp8_kv=False, use_o_scale=False): + q_descale = None + if not fp8_kv: + q, q_descale = scale_fp8(q) + k, k_descale = scale_fp8(k) + v, v_descale = scale_fp8(v) + + # In real world use case, the p scale would be a parameter trained by the + # model. + p_scale = None + + o_scale = torch.rand(1, device="cuda", + requires_grad=False) if use_o_scale else None + + return q, k, v, q_descale, k_descale, v_descale, p_scale, o_scale + + +def input_helper( + Z, + HQ, + HK, + N_CTX_Q, + N_CTX_K, + D_HEAD, + dtype, + layout=None, + use_alibi=None, + causal=None, + is_fp8=False, + fp8_kv=False, + use_o_scale=False, + use_bias=False, +): + assert layout in SUPPORTED_LAYOUTS, "Got unsupported layout." + + current_platform.seed_everything(0) + + # Initialize q, k, v + if layout == 'bhsd': + q_tensor_shape = (Z, HQ, N_CTX_Q, D_HEAD) + k_tensor_shape = (Z, HK, N_CTX_K, D_HEAD) + elif layout == 'bshd': + q_tensor_shape = (Z, N_CTX_Q, HQ, D_HEAD) + k_tensor_shape = (Z, N_CTX_K, HK, D_HEAD) + + if use_alibi: + # for n heads the set of slopes is the geometric sequence that starts + # 2^(-8/n) + alibi_slopes = torch.tensor( + [2**(-8 / HQ * i) for i in range(1, HQ + 1)], + dtype=torch.float32, + device="cuda").repeat(Z, 1) + else: + alibi_slopes = None + + if use_bias: + bias = torch.randn((1, HQ, N_CTX_Q, N_CTX_K), + dtype=dtype, + device="cuda", + requires_grad=False) + else: + bias = None + + q = torch.randn(q_tensor_shape, + dtype=dtype, + device="cuda", + requires_grad=False) + k = torch.randn(k_tensor_shape, + dtype=dtype, + device="cuda", + requires_grad=False) + v = torch.randn(k_tensor_shape, + dtype=dtype, + device="cuda", + requires_grad=False) + + if is_fp8: + (q, k, v, q_descale, k_descale, v_descale, p_scale, + o_scale) = quantize_input(q, + k, + v, + use_o_scale=use_o_scale, + fp8_kv=fp8_kv) + else: + q_descale = k_descale = v_descale = p_scale = o_scale = None + + input_metadata = MetaData(sm_scale=D_HEAD**-0.5, + max_seqlens_q=N_CTX_Q, + max_seqlens_k=N_CTX_K, + layout=layout, + alibi_slopes=alibi_slopes, + alibi_batch=Z, + alibi_nheads=HQ, + q_descale=q_descale, + k_descale=k_descale, + v_descale=v_descale, + p_scale=p_scale, + o_scale=o_scale, + bias=bias, + seqlen_q=N_CTX_Q, + seqlen_k=N_CTX_K) + return q, k, v, input_metadata + + +def varlen_input_helper(Z, + HQ, + HK, + N_CTX_Q, + N_CTX_K, + D_HEAD, + dtype, + equal_seqlens=False): + current_platform.seed_everything(0) + + # Random sequence lengths. 
Using N_CTX as kind of max of sum of individual + # seqs + if not equal_seqlens: + max_seqlens_q = N_CTX_Q // Z + max_seqlens_k = N_CTX_K // Z + seqlens_q = torch.randint(1, + max_seqlens_q + 1, (Z, ), + dtype=torch.int32) + seqlens_k = torch.randint(1, + max_seqlens_k + 1, (Z, ), + dtype=torch.int32) + else: + seqlens_q = torch.full((Z, ), N_CTX_Q // Z) + seqlens_k = torch.full((Z, ), N_CTX_K // Z) + + # Calculate cumulative sequence lengths + cu_seqlens_q = torch.cat([ + torch.tensor([0], dtype=torch.int32), + seqlens_q.cumsum(dim=0, dtype=torch.int32) + ]) + cu_seqlens_k = torch.cat([ + torch.tensor([0], dtype=torch.int32), + seqlens_k.cumsum(dim=0, dtype=torch.int32) + ]) + cu_seqlens_q = cu_seqlens_q.to(device="cuda") + cu_seqlens_k = cu_seqlens_k.to(device="cuda") + + # Initialize q, k, v with variable lengths + total_q = cu_seqlens_q[-1].item() + total_k = cu_seqlens_k[-1].item() + q = torch.randn((total_q, HQ, D_HEAD), dtype=dtype, + device="cuda").normal_(mean=0., std=0.5).requires_grad_() + k = torch.randn((total_k, HK, D_HEAD), dtype=dtype, + device="cuda").normal_(mean=0., std=0.5).requires_grad_() + v = torch.randn((total_k, HK, D_HEAD), dtype=dtype, + device="cuda").normal_(mean=0., std=0.5).requires_grad_() + sm_scale = D_HEAD**-0.5 + input_metadata = MetaData(sm_scale=sm_scale) + input_metadata.set_varlen_params(cu_seqlens_q, cu_seqlens_k) + return q, k, v, input_metadata + + +@pytest.mark.parametrize('Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD', [ + (1, 48, 12, 1, 1, 64), + (4, 4, 4, 128, 128, 65), + (16, 48, 48, 1, 1, 128), + (64, 48, 24, 3, 3, 128), + (4, 4, 4, 113, 123, 1), +]) +@pytest.mark.parametrize('causal', [True, False]) +@pytest.mark.parametrize('use_alibi', [True, False]) +@pytest.mark.parametrize('layout', ['bshd']) +def test_op_fwd(Z, + HQ, + HK, + N_CTX_Q, + N_CTX_K, + D_HEAD, + causal, + use_alibi, + layout, + dtype=torch.float16): + current_platform.seed_everything(0) + q, k, v, input_metadata = input_helper(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, + dtype, layout, use_alibi, causal) + + o = torch.empty_like(q) + + # triton implementation + tri_out, _ = triton_attention_rocm(q, k, v, o, input_metadata) + + # Transpose here if layout is bshd so we have same reference code for all + # layouts + if layout == 'bshd': + q = q.transpose(1, 2).clone() + k = k.transpose(1, 2).clone() + v = v.transpose(1, 2).clone() + # Replicate K and V if using MQA/GQA + if HQ != HK: + k = k.view(k.shape[0], k.shape[1], -1, k.shape[2], + k.shape[3]).expand(-1, -1, HQ // HK, -1, + -1).reshape(k.shape[0], -1, k.shape[2], + k.shape[3]) + v = v.view(v.shape[0], v.shape[1], -1, v.shape[2], + v.shape[3]).expand(-1, -1, HQ // HK, -1, + -1).reshape(v.shape[0], -1, v.shape[2], + v.shape[3]) + + ref_impl = ReferenceAttention(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, + use_alibi, dtype, input_metadata) + ref_out = ref_impl.fwd(q, k, v) + + torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2) + + +@pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [ + (4, 48, 1, 1, 64), + (4, 48, 1, 1, 128), + (4, 48, 3, 3, 128), + (4, 4, 128, 128, 65), +]) +@pytest.mark.parametrize('causal', [True, False]) +@pytest.mark.parametrize('layout', ['bhsd']) +@pytest.mark.parametrize('use_o_scale', [True, False]) +@pytest.mark.skipif(torch.cuda.get_device_capability() < (9, 0), + reason="Triton FP8 requires CUDA 9.0 or higher") +def test_op_fwd_fp8(Z, + H, + N_CTX_Q, + N_CTX_K, + D_HEAD, + causal, + layout, + use_o_scale, + dtype=torch.float32): + current_platform.seed_everything(0) + + # Disable grad to 
save memory it won't run into OOM on CI machine. + # q, k, v, input_metadata = input_helper(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, + # dtype, layout) + + q_quantized, k_quantized, v_quantized, input_metadata = input_helper( + Z, + H, + H, + N_CTX_Q, + N_CTX_K, + D_HEAD, + dtype, + causal=causal, + layout=layout, + is_fp8=True, + use_o_scale=use_o_scale) + + o = torch.empty_like(q_quantized) if use_o_scale else None + + tri_out, _ = triton_attention_rocm(q_quantized, k_quantized, v_quantized, + o, input_metadata) + + ref_impl = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False, + dtype, input_metadata) + ref_out = ref_impl.fwd_fp8(q_quantized, k_quantized, v_quantized) + + # compare + torch.testing.assert_close(ref_out.to(torch.float32), + tri_out.to(torch.float32), + atol=7e-2, + rtol=2e-1) + + +@pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [ + (4, 48, 1, 1, 64), + (4, 48, 1, 1, 128), + (4, 48, 3, 3, 128), + (4, 4, 128, 128, 65), + (4, 4, 113, 123, 1), +]) +@pytest.mark.parametrize('causal', [True, False]) +@pytest.mark.parametrize('layout', ['bhsd']) +def test_op_fwd_fp8_kv(Z, + H, + N_CTX_Q, + N_CTX_K, + D_HEAD, + causal, + layout, + dtype=torch.float32): + current_platform.seed_everything(0) + + q, k_quantized, v_quantized, input_metadata = input_helper(Z, + H, + H, + N_CTX_Q, + N_CTX_K, + D_HEAD, + dtype, + causal=causal, + layout=layout, + is_fp8=True, + fp8_kv=True) + + o = torch.empty_like(q) + + tri_out, _ = triton_attention_rocm(q, k_quantized, v_quantized, o, + input_metadata) + + ref_impl = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False, + dtype, input_metadata) + ref_out = ref_impl.fwd_fp8_kv(q, k_quantized, v_quantized) + + torch.testing.assert_close(ref_out, tri_out, atol=3e-2, rtol=8e-1) + + +@pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [ + (4, 48, 1, 1, 64), + (4, 48, 1, 1, 128), + (4, 48, 3, 3, 128), + (4, 4, 128, 128, 65), +]) +@pytest.mark.parametrize('causal', [True, False]) +@pytest.mark.parametrize('use_bias', [True]) +@pytest.mark.parametrize('dtype', [torch.bfloat16]) +def test_op_fwd_bias(Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_bias, dtype): + current_platform.seed_everything(0) + q, k, v, input_metadata = input_helper(Z, + H, + H, + N_CTX_Q, + N_CTX_K, + D_HEAD, + dtype, + layout='bhsd', + causal=causal, + use_bias=use_bias) + o = torch.empty_like(q) + + # triton implementation + tri_out, _ = triton_attention_rocm(q, k, v, o, input_metadata) + + ref_impl = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False, + dtype, input_metadata) + ref_out = ref_impl.fwd(q, k, v) + + # compare + torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2) + + +# NOTE: Uses thd layout, so also tests thd. +@pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(1, 48, 256, 64), + (4, 48, 512, 64), + (16, 48, 512, 64), + (64, 48, 128, 128)]) +@pytest.mark.parametrize('causal', [True, False]) +def test_op_varlen_fwd(Z, H, N_CTX, D_HEAD, causal, dtype=torch.float16): + + q, k, v, input_metadata = varlen_input_helper(Z, H, H, N_CTX, N_CTX, + D_HEAD, dtype) + + tri_out = torch.empty_like(q) + triton_attention_rocm(q, k, v, tri_out, input_metadata) + + ref_impl = ReferenceAttention(Z, H, H, N_CTX, N_CTX, D_HEAD, False, dtype, + input_metadata) + ref_out = ref_impl.varlen_fwd(q, k, v, is_mqa=False) + + torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2) + + +# NOTE: Uses thd layout, so also tests thd. 
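# Layout reminder (illustrative): in the packed "thd" layout there is no batch
# dimension; q/k/v are (total_tokens, n_heads, head_dim) and sequence i of the
# batch occupies rows cu_seqlens[i]:cu_seqlens[i + 1], which is exactly how
# varlen_fwd above slices per-sequence blocks out of q, k and v.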
+@pytest.mark.parametrize('Z, HQ, HK, N_CTX, D_HEAD', [(2, 48, 24, 128, 64), + (4, 48, 12, 256, 64), + (4, 48, 4, 512, 64), + (4, 64, 16, 128, 128)]) +@pytest.mark.parametrize('causal', [False]) +def test_op_varlen_mqa_fwd(Z, + HQ, + HK, + N_CTX, + D_HEAD, + causal, + dtype=torch.float16): + q, k, v, input_metadata = varlen_input_helper(Z, HQ, HK, N_CTX, N_CTX, + D_HEAD, dtype) + + tri_out = torch.empty_like(q) + triton_attention_rocm(q, k, v, tri_out, input_metadata) + + ref_impl = ReferenceAttention(Z, HQ, HK, N_CTX, N_CTX, D_HEAD, False, + dtype, input_metadata) + ref_out = ref_impl.varlen_fwd(q, k, v, is_mqa=True) + + torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2) diff --git a/tests/kernels/test_utils.py b/tests/kernels/test_utils.py deleted file mode 100644 index d3f03200265..00000000000 --- a/tests/kernels/test_utils.py +++ /dev/null @@ -1,25 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -""" -Tests for miscellaneous utilities -""" - -import pytest -import torch - -from tests.kernels.utils import opcheck -from vllm.platforms import current_platform - - -def test_convert_fp8_opcheck(): - data = torch.randn((256, 256), dtype=torch.float32, device="cuda") - result = torch.empty_like(data, dtype=torch.float8_e4m3fn) - opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8")) - - -@pytest.mark.skipif(not current_platform.is_cuda(), - reason="Only supported for CUDA") -def test_cuda_utils_opcheck(): - opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0)) - opcheck( - torch.ops._C_cuda_utils. - get_max_shared_memory_per_block_device_attribute, (0, )) diff --git a/tests/kernels/utils_block.py b/tests/kernels/utils_block.py deleted file mode 100644 index c16cba50967..00000000000 --- a/tests/kernels/utils_block.py +++ /dev/null @@ -1,63 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import torch - - -def native_w8a8_block_matmul(A: torch.Tensor, B: torch.Tensor, - As: torch.Tensor, Bs: torch.Tensor, block_size, - output_dtype): - """This function performs matrix multiplication with block-wise - quantization using native torch. - It is agnostic to the input data type and can be used for both int8 and - fp8 data types. - - It takes two input tensors `A` and `B` (int8) with scales `As` and - `Bs` (float32). - The output is returned in the specified `output_dtype`. 
- """ - A = A.to(torch.float32) - B = B.to(torch.float32) - assert A.shape[-1] == B.shape[-1] - assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2 - assert len(block_size) == 2 - block_n, block_k = block_size[0], block_size[1] - assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1] - assert A.shape[:-1] == As.shape[:-1] - - M = A.numel() // A.shape[-1] - N, K = B.shape - origin_C_shape = A.shape[:-1] + (N, ) - A = A.reshape(M, A.shape[-1]) - As = As.reshape(M, As.shape[-1]) - n_tiles = (N + block_n - 1) // block_n - k_tiles = (K + block_k - 1) // block_k - assert n_tiles == Bs.shape[0] - assert k_tiles == Bs.shape[1] - - C_shape = (M, N) - C = torch.zeros(C_shape, dtype=torch.float32, device=A.device) - - A_tiles = [ - A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles) - ] - B_tiles = [[ - B[ - j * block_n:min((j + 1) * block_n, N), - i * block_k:min((i + 1) * block_k, K), - ] for i in range(k_tiles) - ] for j in range(n_tiles)] - C_tiles = [ - C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles) - ] - As_tiles = [As[:, i:i + 1] for i in range(k_tiles)] - - for i in range(k_tiles): - for j in range(n_tiles): - a = A_tiles[i] - b = B_tiles[j][i] - c = C_tiles[j] - s = As_tiles[i] * Bs[j][i] - c[:, :] += torch.matmul(a, b.t()) * s - - C = C.reshape(origin_C_shape).to(output_dtype) - return C diff --git a/tests/kv_transfer/test_disagg.py b/tests/kv_transfer/test_disagg.py index 5b9ea6dba40..dc948a48bf3 100644 --- a/tests/kv_transfer/test_disagg.py +++ b/tests/kv_transfer/test_disagg.py @@ -14,8 +14,8 @@ # Fixture to set up environment variables and teardown servers after tests @pytest.fixture(scope="module", autouse=True) def setup_servers(): - if torch.cuda.device_count() < 4: - pytest.skip("Skipping test: fewer than 4 GPUs available") + if torch.cuda.device_count() < 2: + pytest.skip("Skipping test: fewer than 2 GPUs available") # Set up environment variables VLLM_HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'", diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index cdb8c893b8b..e3a054bd620 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -47,6 +47,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: ] sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256, + skip_special_tokens=False, stop=["[/assistant]"]) outputs = llm.generate( prompts, diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 576d95a4715..52b0834cacb 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -31,6 +31,8 @@ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] if current_platform.is_cuda_alike() else ["cpu"]) +DEFAULT_DTYPE = torch.get_default_dtype() + @pytest.fixture(scope="function", autouse=True) def use_v0_only(monkeypatch: pytest.MonkeyPatch): @@ -125,8 +127,10 @@ def test_replace_submodules(dist_init, dummy_model): model = dummy_model manager = LoRAModelManager( model, 1, 1, 1, - LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8), - torch.device(DEVICES[0])) + LoRAConfig(max_lora_rank=8, + max_cpu_loras=8, + max_loras=8, + lora_dtype=DEFAULT_DTYPE), torch.device(DEVICES[0])) model = manager.model assert isinstance(model.get_submodule("dense1"), ColumnParallelLinearWithLoRA) @@ -155,7 +159,8 @@ def test_lora_model_manager(dist_init, dummy_model, device): 2, LoRAConfig(max_lora_rank=8, max_cpu_loras=3, - max_loras=2), + max_loras=2, + lora_dtype=DEFAULT_DTYPE), 
device=device) assert all(x is None for x in manager.lora_index_to_id) assert manager.add_adapter(model_lora1) @@ -221,7 +226,8 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): 2, LoRAConfig(max_lora_rank=8, max_cpu_loras=3, - max_loras=2), + max_loras=2, + lora_dtype=DEFAULT_DTYPE), device=device) assert all(x is None for x in manager.lora_index_to_id) assert manager.add_adapter(model_lora1) @@ -316,7 +322,8 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device): 2, LoRAConfig(max_lora_rank=8, max_cpu_loras=2, - max_loras=2), + max_loras=2, + lora_dtype=DEFAULT_DTYPE), device=device) assert all(x is None for x in manager.lora_index_to_id) @@ -424,7 +431,10 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device): @pytest.mark.parametrize("device", DEVICES) def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, sql_lora_files, device): - lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) + lora_config = LoRAConfig(max_lora_rank=8, + max_cpu_loras=4, + max_loras=4, + lora_dtype=DEFAULT_DTYPE) worker_adapter_manager = LRUCacheWorkerLoRAManager( 4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size - lora_config.lora_extra_vocab_size, lora_config, device, @@ -504,7 +514,10 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, sql_lora_files, device): # Should remove every LoRA not specified in the request. - lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) + lora_config = LoRAConfig(max_lora_rank=8, + max_cpu_loras=4, + max_loras=4, + lora_dtype=DEFAULT_DTYPE) worker_adapter_manager = WorkerLoRAManager( 4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size - lora_config.lora_extra_vocab_size, lora_config, device, @@ -600,7 +613,8 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device): 2, LoRAConfig(max_lora_rank=8, max_cpu_loras=2, - max_loras=2), + max_loras=2, + lora_dtype=DEFAULT_DTYPE), device=device) model = manager.model diff --git a/tests/lora/test_resolver.py b/tests/lora/test_resolver.py new file mode 100644 index 00000000000..8ebc2ae98fc --- /dev/null +++ b/tests/lora/test_resolver.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional + +import pytest + +from vllm.lora.request import LoRARequest +from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry + + +class DummyLoRAResolver(LoRAResolver): + """A dummy LoRA resolver for testing.""" + + async def resolve_lora(self, base_model_name: str, + lora_name: str) -> Optional[LoRARequest]: + if lora_name == "test_lora": + return LoRARequest( + lora_name=lora_name, + lora_path=f"/dummy/path/{base_model_name}/{lora_name}", + lora_int_id=abs(hash(lora_name))) + return None + + +def test_resolver_registry_registration(): + """Test basic resolver registration functionality.""" + registry = LoRAResolverRegistry + resolver = DummyLoRAResolver() + + # Register a new resolver + registry.register_resolver("dummy", resolver) + assert "dummy" in registry.get_supported_resolvers() + + # Get registered resolver + retrieved_resolver = registry.get_resolver("dummy") + assert retrieved_resolver is resolver + + +def test_resolver_registry_duplicate_registration(): + """Test registering a resolver with an existing name.""" + registry = LoRAResolverRegistry + resolver1 = DummyLoRAResolver() + resolver2 = DummyLoRAResolver() + + registry.register_resolver("dummy", resolver1) + 
registry.register_resolver("dummy", resolver2) + + assert registry.get_resolver("dummy") is resolver2 + + +def test_resolver_registry_unknown_resolver(): + """Test getting a non-existent resolver.""" + registry = LoRAResolverRegistry + + with pytest.raises(KeyError, match="not found"): + registry.get_resolver("unknown_resolver") + + +@pytest.mark.asyncio +async def test_dummy_resolver_resolve(): + """Test the dummy resolver's resolve functionality.""" + dummy_resolver = DummyLoRAResolver() + base_model_name = "base_model_test" + lora_name = "test_lora" + + # Test successful resolution + result = await dummy_resolver.resolve_lora(base_model_name, lora_name) + assert isinstance(result, LoRARequest) + assert result.lora_name == lora_name + assert result.lora_path == f"/dummy/path/{base_model_name}/{lora_name}" + + # Test failed resolution + result = await dummy_resolver.resolve_lora(base_model_name, + "nonexistent_lora") + assert result is None diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py index d605ab73468..8845eb33d20 100644 --- a/tests/lora/test_tokenizer_group.py +++ b/tests/lora/test_tokenizer_group.py @@ -5,17 +5,14 @@ from vllm.lora.request import LoRARequest from vllm.transformers_utils.tokenizer import get_lora_tokenizer -from vllm.transformers_utils.tokenizer_group import get_tokenizer_group - -from ..conftest import get_tokenizer_pool_config +from vllm.transformers_utils.tokenizer_group import TokenizerGroup @pytest.mark.asyncio @pytest.mark.parametrize("tokenizer_group_type", [None, "ray"]) async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type): reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files) - tokenizer_group = get_tokenizer_group( - get_tokenizer_pool_config(tokenizer_group_type), + tokenizer_group = TokenizerGroup( tokenizer_id="gpt2", enable_lora=True, max_num_seqs=1, @@ -60,8 +57,7 @@ def test_get_lora_tokenizer(sql_lora_files, tmp_path): @pytest.mark.parametrize("max_num_seqs", [1, 2]) @pytest.mark.parametrize("max_loras", [1, 2]) def test_lora_tokenizers(enable_lora, max_num_seqs, max_loras): - tokenizer_group = get_tokenizer_group( - get_tokenizer_pool_config(None), + tokenizer_group = TokenizerGroup( tokenizer_id="gpt2", enable_lora=enable_lora, max_num_seqs=max_num_seqs, diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index 1c90cedf1a1..67f3866beff 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -39,6 +39,18 @@ def test_parse_fine_tuned_lora_name_valid(): False, False, ), + ( + "language_model.layers.9.mlp.down_proj.lora_A.weight", + "language_model.layers.9.mlp.down_proj", + True, + False, + ), + ( + "language_model.layers.9.mlp.down_proj.lora_B.weight", + "language_model.layers.9.mlp.down_proj", + False, + False, + ), } for name, module_name, is_lora_a, is_bias in fixture: assert (module_name, is_lora_a, diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index ac2e0f3542e..2d9cf1d48fd 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -11,6 +11,8 @@ dispatch_fused_experts_func, dispatch_topk_func, torch_vllm_inplace_fused_experts, torch_vllm_outplace_fused_experts, vllm_topk_softmax) +from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + is_rocm_aiter_moe_enabled) from vllm.model_executor.layers.layernorm import ( RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm, 
rocm_aiter_fused_add_rms_norm, rocm_aiter_rms_norm) @@ -100,11 +102,10 @@ def test_enabled_ops_invalid(env: str): def test_topk_dispatch(use_rocm_aiter: str, monkeypatch): monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) topk_func = dispatch_topk_func() - + is_rocm_aiter_moe_enabled.cache_clear() if current_platform.is_rocm() and int(use_rocm_aiter): from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( rocm_aiter_topk_softmax) - assert topk_func == rocm_aiter_topk_softmax else: assert topk_func == vllm_topk_softmax @@ -116,11 +117,11 @@ def test_fused_experts_dispatch(use_rocm_aiter: str, inplace: bool, monkeypatch): monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + is_rocm_aiter_moe_enabled.cache_clear() fused_experts_func = dispatch_fused_experts_func(inplace) if current_platform.is_rocm() and int(use_rocm_aiter): from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( rocm_aiter_fused_experts) - assert fused_experts_func == rocm_aiter_fused_experts elif inplace: assert fused_experts_func == torch_vllm_inplace_fused_experts diff --git a/tests/model_executor/test_guided_processors.py b/tests/model_executor/test_guided_processors.py index 59da575e37b..6cd966f8480 100644 --- a/tests/model_executor/test_guided_processors.py +++ b/tests/model_executor/test_guided_processors.py @@ -202,12 +202,15 @@ def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex): def test_guided_decoding_backend_options(): """Test backend-specific options""" - params = GuidedDecodingParams( - backend="xgrammar:option-1,option-2,option-3") - assert params.backend_options() == ["option-1", "option-2", "option-3"] - - no_fallback = GuidedDecodingParams(backend="xgrammar:option-1,no-fallback") - assert no_fallback.no_fallback() + with pytest.warns(DeprecationWarning): + guided_decoding_params = GuidedDecodingParams( + backend= + "xgrammar:no-fallback,disable-any-whitespace,no-additional-properties" + ) + assert guided_decoding_params.backend == "xgrammar" + assert guided_decoding_params.disable_fallback + assert guided_decoding_params.disable_any_whitespace + assert guided_decoding_params.disable_additional_properties def test_pickle_xgrammar_tokenizer_data(): diff --git a/tests/models/decoder_only/language/test_hybrid.py b/tests/models/decoder_only/language/test_hybrid.py deleted file mode 100644 index 60eb3830c6d..00000000000 --- a/tests/models/decoder_only/language/test_hybrid.py +++ /dev/null @@ -1,360 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -from tests.utils import multi_gpu_test -from vllm.engine.arg_utils import EngineArgs -from vllm.sampling_params import SamplingParams - -from ...utils import check_outputs_equal - -# This test is for the hybrid models -MODELS = ["ai21labs/Jamba-tiny-dev", "Zyphra/Zamba2-1.2B-instruct"] -# Bamba at Fp32 is too big for the CI (L4 GPU). 
-# MODELS = ["ai21labs/Jamba-tiny-dev", "ibm-ai-platform/Bamba-9B"] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - - # numeric error produces different generation - if "Bamba" in model: - example_prompts.pop(3) - - model_kwargs = { - "use_mamba_kernels": False, # mamba kernels are not installed so HF - # don't use them - } - if "Zamba2" in model: - # Zamba2 HF implementation automatically checks if mamba kernels are - # installed - model_kwargs = {} - - with hf_runner(model, dtype=dtype, model_kwargs=model_kwargs) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - - for i in range(len(example_prompts)): - hf_output_ids, hf_output_str = hf_outputs[i] - vllm_output_ids, vllm_output_str = vllm_outputs[i] - assert hf_output_str == vllm_output_str, ( - f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") - assert hf_output_ids == vllm_output_ids, ( - f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -def test_batching( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - # To pass the small model tests, we need full precision. - for_loop_outputs = [] - with vllm_runner(model, dtype=dtype) as vllm_model: - for prompt in example_prompts: - for_loop_outputs.append( - vllm_model.generate_greedy([prompt], max_tokens)[0]) - - batched_outputs = vllm_model.generate_greedy(example_prompts, - max_tokens) - - check_outputs_equal( - outputs_0_lst=for_loop_outputs, - outputs_1_lst=batched_outputs, - name_0="for_loop_vllm", - name_1="batched_vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float16"]) -@pytest.mark.parametrize("max_tokens", [10]) -def test_mamba_prefill_chunking_with_parallel_sampling( - hf_runner, vllm_runner, example_prompts, model: str, dtype: str, - max_tokens: int) -> None: - # Tests prefill chunking in conjunction with n>1, in this case, - # prefill is populated with decoding tokens and we test that it - # doesn't fail This test might fail if cache is not allocated - # correctly for n > 1 decoding steps inside a - # chunked prefill forward pass (where we have both prefills - # and decoding together ) - sampling_params = SamplingParams(n=3, - temperature=1, - seed=0, - max_tokens=max_tokens) - with vllm_runner( - model, - dtype=dtype, - enable_chunked_prefill=True, - max_num_batched_tokens=30, - max_num_seqs=10 # forces prefill chunks with decoding - ) as vllm_model: - vllm_model.generate(example_prompts, sampling_params) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [7]) -def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts, - model: str, dtype: str, - max_tokens: int) -> None: - # numeric error during prefill chunking produces different generation - # compared to w/o prefill chunking for those examples, removed them for now - if "Jamba" in model: - example_prompts.pop(7) - example_prompts.pop(2) - example_prompts.pop(1) - elif "Bamba" in model: - 
example_prompts.pop(6) - example_prompts.pop(3) - example_prompts.pop(2) - dtype = "half" # use a different dtype for Bamba - elif "Zamba2" in model: - example_prompts.pop(7) - dtype = "half" - - model_kwargs = { - "use_mamba_kernels": False, # mamba kernels are not installed so HF - # don't use them - } - if "Zamba2" in model: - # Zamba2 HF implementation automatically checks if mamba kernels are - # installed - model_kwargs = {} - - with hf_runner(model, dtype=dtype, model_kwargs=model_kwargs) as hf_model: - non_chunked = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner(model, - dtype=dtype, - enable_chunked_prefill=True, - max_num_batched_tokens=5, - max_num_seqs=2) as vllm_model: - chunked = vllm_model.generate_greedy(example_prompts, - max_tokens=max_tokens) - - check_outputs_equal( - outputs_0_lst=chunked, - outputs_1_lst=non_chunked, - name_0="chunked", - name_1="non_chunked", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [15]) -def test_parallel_sampling( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - - with vllm_runner(model, dtype=dtype) as vllm_model: - for_loop_outputs = [] - for _ in range(10): - for_loop_outputs.append( - # using example_prompts index 1 instead of 0 since with 0 the - # logprobs get really close and the test doesn't pass - vllm_model.generate_greedy([example_prompts[1]], max_tokens) - [0]) - sampling_params = SamplingParams(n=10, - temperature=0.001, - seed=0, - max_tokens=max_tokens) - n_lt_1_outputs = vllm_model.generate([example_prompts[1]], - sampling_params) - token_ids, texts = n_lt_1_outputs[0] - n_lt_1_outputs = [(token_id, text) - for token_id, text in zip(token_ids, texts)] - - check_outputs_equal( - outputs_0_lst=n_lt_1_outputs, - outputs_1_lst=for_loop_outputs, - name_0="vllm_n_lt_1_outputs", - name_1="vllm", - ) - - -@pytest.mark.skip(reason="RE-ENABLE: test is currently failing on main.") -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [20]) -def test_mamba_cache_cg_padding( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - # This test is for verifying that mamba cache is padded to CG captured - # batch size. If it's not, a torch RuntimeError will be raised because - # tensor dimensions aren't compatible - vllm_config = EngineArgs(model=model).create_engine_config() - while len(example_prompts) == vllm_config.pad_for_cudagraph( - len(example_prompts)): - example_prompts.append(example_prompts[0]) - - try: - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_model.generate_greedy(example_prompts, max_tokens) - except RuntimeError: - pytest.fail( - "Couldn't run batch size which is not equal to a Cuda Graph " - "captured batch size. 
" - "Could be related to mamba cache not padded correctly") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [20]) -def test_models_preemption_recompute( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - # Tests that outputs are identical with and w/o preemtions (recompute) - assert dtype == "float" - - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_model.model.llm_engine.scheduler[ - 0].ENABLE_ARTIFICIAL_PREEMPT = True - preempt_vllm_outputs = vllm_model.generate_greedy( - example_prompts, max_tokens) - - vllm_model.model.llm_engine.scheduler[ - 0].ENABLE_ARTIFICIAL_PREEMPT = False - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=preempt_vllm_outputs, - outputs_1_lst=vllm_outputs, - name_0="vllm_preepmtions", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks( - vllm_runner, - model: str, - dtype: str, - example_prompts, -) -> None: - # This test is for verifying that the hybrid inner state management doesn't - # collapse in case where the number of incoming requests and - # finished_requests_ids is larger than the maximum mamba block capacity. - # This could generally happen due to the fact that hybrid does support - # statelessness mechanism where it can cleanup new incoming requests in - # a single step. - try: - with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model: - vllm_model.generate_greedy([example_prompts[0]] * 100, 10) - except ValueError: - pytest.fail("Hybrid inner state wasn't cleaned up properly between" - "steps finished requests registered unnecessarily ") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_state_cleanup( - vllm_runner, - model: str, - dtype: str, - example_prompts, -) -> None: - # This test is for verifying that the Hybrid state is cleaned up between - # steps, If its not cleaned, an error would be expected. 
- try: - with vllm_runner(model, dtype=dtype) as vllm_model: - for _ in range(10): - vllm_model.generate_greedy([example_prompts[0]] * 100, 1) - except ValueError: - pytest.fail("Hybrid inner state wasn't cleaned up between states, " - "could be related to finished_requests_ids") - - -@pytest.mark.skip(reason="RE-ENABLE: test is currently failing on main.") -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_multistep( - vllm_runner, - model: str, - dtype: str, - example_prompts, -) -> None: - # This test is verifying that multistep works correctly - #on mamba-like models - with vllm_runner(model, num_scheduler_steps=8, - max_num_seqs=2) as vllm_model: - vllm_model.generate_greedy([example_prompts[0]] * 10, 1) - - -@pytest.mark.skip(reason="RE-ENABLE: test is currently failing on main.") -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [64]) -def test_multistep_correctness(vllm_runner, model: str, dtype: str, - max_tokens: int, example_prompts) -> None: - with vllm_runner(model, num_scheduler_steps=8, - max_num_seqs=2) as vllm_model: - vllm_outputs_multistep = vllm_model.generate_greedy( - example_prompts, max_tokens) - - with vllm_runner(model, num_scheduler_steps=1, - max_num_seqs=2) as vllm_model: - vllm_outputs_single_step = vllm_model.generate_greedy( - example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=vllm_outputs_multistep, - outputs_1_lst=vllm_outputs_single_step, - name_0="vllm_outputs_multistep", - name_1="vllm_outputs_single_step", - ) - - -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [64]) -def test_hybrid_distributed_produces_identical_generation( - vllm_runner, model: str, dtype: str, max_tokens: int, - example_prompts) -> None: - - with vllm_runner(model, dtype=dtype, tensor_parallel_size=2) as vllm_model: - vllm_outputs_tp_2 = vllm_model.generate_greedy(example_prompts, - max_tokens) - - with vllm_runner(model, dtype=dtype, tensor_parallel_size=1) as vllm_model: - vllm_outputs_tp_1 = vllm_model.generate_greedy(example_prompts, - max_tokens) - - check_outputs_equal( - outputs_0_lst=vllm_outputs_tp_1, - outputs_1_lst=vllm_outputs_tp_2, - name_0="vllm_tp_1", - name_1="vllm_tp_2", - ) diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py deleted file mode 100644 index 47b9c0f69c3..00000000000 --- a/tests/models/decoder_only/language/test_mamba.py +++ /dev/null @@ -1,337 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -"""Compare the outputs of HF and vLLM when using greedy sampling for Mamba. - -Run `pytest tests/models/test_mamba.py`. -""" -import pytest -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer - -from vllm.engine.arg_utils import EngineArgs -from vllm.sampling_params import SamplingParams - -from ...utils import check_outputs_equal - -MODELS = [ - "state-spaces/mamba-130m-hf", - "tiiuae/falcon-mamba-tiny-dev", - # TODO: Compare to a Mamba2 model. The HF transformers implementation of - # Mamba2 is buggy for Codestral as it doesn't handle n_groups. - # See https://github.com/huggingface/transformers/pull/35943 - # "mistralai/Mamba-Codestral-7B-v0.1", -] - - -# Use lower-level interfaces to create this greedy generator, as mamba will -# choke on the model_kwarg 'attention_mask' if hf_model.generate_greedy is used. 
-def generate_greedy(model_name, example_prompts, max_tokens): - # Create a text generation pipeline - tokenizer = AutoTokenizer.from_pretrained(model_name) - model = AutoModelForCausalLM.from_pretrained(model_name) - - # Set the device (GPU if available, else CPU) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - model.to(device) - - # Generate texts from the prompts - outputs = [] - for prompt in example_prompts: - # Tokenize the input prompt with truncation - inputs = tokenizer(prompt, return_tensors="pt", truncation=True) - input_ids = inputs["input_ids"].to(model.device) - - # Generate text using the model's generate method directly - generated_ids = model.generate(input_ids, - max_new_tokens=max_tokens, - do_sample=False) - generated_text = tokenizer.decode(generated_ids[0], - skip_special_tokens=True) - - outputs.append((generated_ids[0].tolist(), generated_text)) - - return outputs - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -def test_models( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - hf_outputs = generate_greedy(model, example_prompts, max_tokens) - - # Set max_num_seqs to keep Codestral from going OOM at fp32 - with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - - for i in range(len(example_prompts)): - hf_output_ids, hf_output_str = hf_outputs[i] - vllm_output_ids, vllm_output_str = vllm_outputs[i] - assert hf_output_str == vllm_output_str, ( - f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") - assert hf_output_ids == vllm_output_ids, ( - f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -def test_batching( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - # To pass the small model tests, we need full precision. - for_loop_outputs = [] - with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model: - for prompt in example_prompts: - for_loop_outputs.append( - vllm_model.generate_greedy([prompt], max_tokens)[0]) - - batched_outputs = vllm_model.generate_greedy(example_prompts, - max_tokens) - - check_outputs_equal( - outputs_0_lst=for_loop_outputs, - outputs_1_lst=batched_outputs, - name_0="for_loop_vllm", - name_1="batched_vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [10]) -def test_chunked_prefill_with_parallel_sampling(vllm_runner, example_prompts, - model: str, dtype: str, - max_tokens: int) -> None: - # Tests chunked prefill in conjunction with n>1. In this case, prefill is - # populated with decoding tokens and we test that it doesn't fail. 
- # This test might fail if cache is not allocated correctly for n > 1 - # decoding steps inside a chunked prefill forward pass (where we have both - # prefill and decode together ) - sampling_params = SamplingParams(n=3, - temperature=1, - seed=0, - max_tokens=max_tokens) - with vllm_runner( - model, - dtype=dtype, - enable_chunked_prefill=True, - max_num_batched_tokens=30, - max_num_seqs=10 # forces prefill chunks with decoding - ) as vllm_model: - vllm_model.generate(example_prompts, sampling_params) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16]) -def test_chunked_prefill(vllm_runner, example_prompts, model: str, dtype: str, - max_tokens: int, - chunked_prefill_token_size: int) -> None: - """ - Checks exact match decode between huggingface model and vllm runner with - chunked prefill. - """ - max_num_seqs = chunked_prefill_token_size - max_num_batched_tokens = chunked_prefill_token_size - - non_chunked = generate_greedy(model, example_prompts, max_tokens) - - with vllm_runner(model, - dtype=dtype, - enable_chunked_prefill=True, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs) as vllm_model: - chunked = vllm_model.generate_greedy(example_prompts, - max_tokens=max_tokens) - - check_outputs_equal( - outputs_0_lst=chunked, - outputs_1_lst=non_chunked, - name_0="chunked", - name_1="non_chunked", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [15]) -def test_parallel_sampling( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - - # Numerical differences produce slightly different output for these - if 'state-spaces' in model: - example_prompts.pop(0) - example_prompts.pop(0) - example_prompts.pop(0) - - with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model: - for_loop_outputs = [] - for _ in range(10): - for_loop_outputs.append( - vllm_model.generate_greedy(example_prompts, max_tokens)[0]) - sampling_params = SamplingParams(n=10, - temperature=0.001, - seed=0, - max_tokens=max_tokens) - n_lt_1_outputs = vllm_model.generate(example_prompts, sampling_params) - token_ids, texts = n_lt_1_outputs[0] - n_lt_1_outputs = [(token_id, text) - for token_id, text in zip(token_ids, texts)] - - check_outputs_equal( - outputs_0_lst=n_lt_1_outputs, - outputs_1_lst=for_loop_outputs, - name_0="vllm_n_lt_1_outputs", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [20]) -def test_mamba_cache_cg_padding( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - # This test is for verifying that mamba cache is padded to CG captured - # batch size. If it's not, a torch RuntimeError will be raised because - # tensor dimensions aren't compatible - vllm_config = EngineArgs(model=model).create_engine_config() - while len(example_prompts) == vllm_config.pad_for_cudagraph( - len(example_prompts)): - example_prompts.append(example_prompts[0]) - - try: - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_model.generate_greedy(example_prompts, max_tokens) - except RuntimeError: - pytest.fail( - "Couldn't run batch size which is not equal to a Cuda Graph " - "captured batch size. 
" - "Could be related to mamba cache not padded correctly") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [20]) -def test_models_preemption_recompute( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - # Tests that outputs are identical with and w/o preemtions (recompute) - assert dtype == "float" - - with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model: - vllm_model.model.llm_engine.scheduler[ - 0].ENABLE_ARTIFICIAL_PREEMPT = True - preempt_vllm_outputs = vllm_model.generate_greedy( - example_prompts, max_tokens) - - vllm_model.model.llm_engine.scheduler[ - 0].ENABLE_ARTIFICIAL_PREEMPT = False - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=preempt_vllm_outputs, - outputs_1_lst=vllm_outputs, - name_0="vllm_preepmtions", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks( - vllm_runner, - model: str, - dtype: str, - example_prompts, -) -> None: - # This test is for verifying that the Mamba inner state management doesn't - # collapse in case where the number of incoming requests and - # finished_requests_ids is larger than the maximum Mamba block capacity. - # This could generally happen due to the fact that Mamba does support - # statelessness mechanism where it can cleanup new incoming requests in - # a single step. - try: - with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model: - vllm_model.generate_greedy([example_prompts[0]] * 100, 10) - except ValueError: - pytest.fail("Mamba inner state wasn't cleaned up properly between" - "steps finished requests registered unnecessarily ") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_state_cleanup( - vllm_runner, - model: str, - dtype: str, - example_prompts, -) -> None: - # This test is for verifying that the Mamba state is cleaned up between - # steps, If its not cleaned, an error would be expected. 
- try: - with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model: - for _ in range(10): - vllm_model.generate_greedy([example_prompts[0]] * 100, 1) - except ValueError: - pytest.fail("Mamba inner state wasn't cleaned up between states, " - "could be related to finished_requests_ids") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_multistep( - vllm_runner, - model: str, - dtype: str, - example_prompts, -) -> None: - with vllm_runner(model, num_scheduler_steps=8, - max_num_seqs=2) as vllm_model: - vllm_model.generate_greedy([example_prompts[0]] * 10, 1) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [64]) -def test_multistep_correctness(vllm_runner, model: str, dtype: str, - max_tokens: int, example_prompts) -> None: - with vllm_runner(model, num_scheduler_steps=8, - max_num_seqs=2) as vllm_model: - vllm_outputs_multistep = vllm_model.generate_greedy( - example_prompts, max_tokens) - - with vllm_runner(model, num_scheduler_steps=1, - max_num_seqs=2) as vllm_model: - vllm_outputs_single_step = vllm_model.generate_greedy( - example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=vllm_outputs_multistep, - outputs_1_lst=vllm_outputs_single_step, - name_0="vllm_outputs_multistep", - name_1="vllm_outputs_single_step", - ) diff --git a/tests/models/embedding/utils.py b/tests/models/embedding/utils.py deleted file mode 100644 index 5aeeb517854..00000000000 --- a/tests/models/embedding/utils.py +++ /dev/null @@ -1,39 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from collections.abc import Sequence - -import torch -import torch.nn.functional as F - - -def check_embeddings_close( - *, - embeddings_0_lst: Sequence[list[float]], - embeddings_1_lst: Sequence[list[float]], - name_0: str, - name_1: str, - tol: float = 1e-3, -) -> None: - assert len(embeddings_0_lst) == len(embeddings_1_lst) - - for prompt_idx, (embeddings_0, embeddings_1) in enumerate( - zip(embeddings_0_lst, embeddings_1_lst)): - assert len(embeddings_0) == len(embeddings_1), ( - f"Length mismatch: {len(embeddings_0)} vs. 
{len(embeddings_1)}") - - sim = F.cosine_similarity(torch.tensor(embeddings_0), - torch.tensor(embeddings_1), - dim=0) - - fail_msg = (f"Test{prompt_idx}:" - f"\n{name_0}:\t{embeddings_0[:16]!r}" - f"\n{name_1}:\t{embeddings_1[:16]!r}") - - assert sim >= 1 - tol, fail_msg - - -def matryoshka_fy(tensor, dimensions): - tensor = torch.tensor(tensor) - tensor = tensor[..., :dimensions] - tensor = F.normalize(tensor, p=2, dim=1) - return tensor diff --git a/tests/models/encoder_decoder/__init__.py b/tests/models/encoder_decoder/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/models/encoder_decoder/audio_language/__init__.py b/tests/models/encoder_decoder/audio_language/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/models/encoder_decoder/language/__init__.py b/tests/models/encoder_decoder/language/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/models/encoder_decoder/vision_language/__init__.py b/tests/models/encoder_decoder/vision_language/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/models/encoder_decoder/vision_language/test_broadcast.py b/tests/models/encoder_decoder/vision_language/test_broadcast.py deleted file mode 100644 index 8d986414eec..00000000000 --- a/tests/models/encoder_decoder/vision_language/test_broadcast.py +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -from ....utils import multi_gpu_test - - -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) -@pytest.mark.parametrize("model", [ - "meta-llama/Llama-3.2-11B-Vision-Instruct", -]) -def test_models(hf_runner, vllm_runner, image_assets, - distributed_executor_backend, model) -> None: - - dtype = "half" - max_tokens = 5 - num_logprobs = 5 - tensor_parallel_size = 2 - - if model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"): - from .test_mllama import models, run_test - else: - raise NotImplementedError(f"Unsupported model: {model}") - - run_test( - hf_runner, - vllm_runner, - image_assets, - model=models[0], - size_factors=[0.25, 0.5, 1.0], - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - ) diff --git a/tests/models/decoder_only/__init__.py b/tests/models/language/__init__.py similarity index 100% rename from tests/models/decoder_only/__init__.py rename to tests/models/language/__init__.py diff --git a/tests/models/decoder_only/audio_language/__init__.py b/tests/models/language/generation/__init__.py similarity index 100% rename from tests/models/decoder_only/audio_language/__init__.py rename to tests/models/language/generation/__init__.py diff --git a/tests/models/encoder_decoder/language/test_bart.py b/tests/models/language/generation/test_bart.py similarity index 98% rename from tests/models/encoder_decoder/language/test_bart.py rename to tests/models/language/generation/test_bart.py index e8070d28bef..8ab0167dc77 100644 --- a/tests/models/encoder_decoder/language/test_bart.py +++ b/tests/models/language/generation/test_bart.py @@ -1,8 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -"""Compare the outputs of HF and vLLM for BART models using greedy sampling. - -Run `pytest tests/models/encoder_decoder/language/test_bart.py`. 
-""" from typing import Optional import pytest diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/language/generation/test_common.py similarity index 65% rename from tests/models/decoder_only/language/test_models.py rename to tests/models/language/generation/test_common.py index 79fa3fa9977..c755593c9ac 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/language/generation/test_common.py @@ -1,14 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 -"""Compare the outputs of HF and vLLM when using greedy sampling. - -Run `pytest tests/models/test_models.py`. -""" +import os +from typing import Optional import pytest import torch from vllm.platforms import current_platform +from ....utils import large_gpu_mark +from ...registry import HF_EXAMPLE_MODELS from ...utils import check_logprobs_close # These have unsupported head_dim for FA. We do not @@ -25,9 +25,9 @@ AITER_MODEL_LIST = [ "meta-llama/Llama-3.2-1B-Instruct", "openbmb/MiniCPM3-4B", - "Qwen/Qwen-7B", + "Qwen/Qwen-7B-Chat", "Qwen/Qwen2.5-0.5B-Instruct", - "ehristoforu/Falcon3-MoE-2x7B-Insruct", + "TitanML/tiny-mixtral", ] @@ -60,7 +60,8 @@ pytest.param( "openbmb/MiniCPM3-4B", # fused_moe not supported on CPU - marks=[pytest.mark.core_model], + marks=[pytest.mark.core_model, + large_gpu_mark(min_gb=32)], ), pytest.param( "facebook/opt-125m", # opt @@ -71,7 +72,7 @@ marks=[pytest.mark.core_model], ), pytest.param( - "Qwen/Qwen-7B", # qwen (text-only) + "Qwen/Qwen-7B-Chat", # qwen (text-only) ), pytest.param( "Qwen/Qwen2.5-0.5B-Instruct", # qwen2 @@ -80,18 +81,21 @@ pytest.param("stabilityai/stablelm-3b-4e1t"), # stablelm pytest.param("bigcode/starcoder2-3b"), # starcoder2 pytest.param( - "ehristoforu/Falcon3-MoE-2x7B-Insruct", # mixtral + "TitanML/tiny-mixtral", # mixtral marks=[pytest.mark.cpu_model], ) ]) -@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize( "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) def test_models(hf_runner, vllm_runner, example_prompts, model: str, - dtype: str, max_tokens: int, num_logprobs: int, - use_rocm_aiter: bool, monkeypatch) -> None: + max_tokens: int, num_logprobs: int, use_rocm_aiter: bool, + monkeypatch) -> None: + + model_info = HF_EXAMPLE_MODELS.find_hf_info(model) + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") if model in REQUIRES_V0: monkeypatch.setenv("VLLM_USE_V1", "0") @@ -105,17 +109,38 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, # in parts of the operators pytest.skip(f"Skipping '{model}' model test with AITER kernel.") - with hf_runner(model, dtype=dtype) as hf_model: - if model.startswith("THUDM/chatglm3"): - hf_model.model.get_output_embeddings = lambda: \ - hf_model.model.transformer.output_layer + use_prompt_embeds = os.getenv("VLLM_USE_V1") == "0" + with hf_runner(model) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) - with vllm_runner(model, dtype=dtype) as vllm_model: + prompt_embeds: Optional[list[torch.Tensor]] = ([] if use_prompt_embeds + else None) + + prompt_token_ids = [] + for prompt in example_prompts: + token_ids = hf_model.tokenizer(prompt, + return_tensors="pt").input_ids.to( + hf_model.model.device) + prompt_token_ids.append(token_ids) + if prompt_embeds is not None: + prompt_embeds.append(hf_model.model.get_input_embeddings()( + 
token_ids).squeeze(0)) + + with vllm_runner( + model, + tokenizer_name=model_info.tokenizer or model, + tokenizer_mode=model_info.tokenizer_mode, + trust_remote_code=model_info.trust_remote_code, + max_num_seqs=2, + enable_prompt_embeds=use_prompt_embeds, + ) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) + if prompt_embeds is not None: + vllm_outputs_from_embeds = vllm_model.generate_greedy_logprobs( + prompt_embeds, max_tokens, num_logprobs) check_logprobs_close( outputs_0_lst=hf_outputs, @@ -123,6 +148,14 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, name_0="hf", name_1="vllm", ) + if prompt_embeds is not None: + check_logprobs_close( + outputs_0_lst=vllm_outputs, + outputs_1_lst=vllm_outputs_from_embeds, + name_0="vllm", + name_1="vllm_from_embeds", + ) + if use_rocm_aiter: # this is to ensure that vllm engine # has deallocated the memory before running the next diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/language/generation/test_granite.py similarity index 89% rename from tests/models/decoder_only/language/test_granite.py rename to tests/models/language/generation/test_granite.py index 119b79d64c9..f381c34f44b 100644 --- a/tests/models/decoder_only/language/test_granite.py +++ b/tests/models/language/generation/test_granite.py @@ -1,8 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -"""Compare the outputs of HF and vLLM for Granite models using greedy sampling. - -Run `pytest tests/models/test_granite.py`. -""" import pytest from ...utils import check_logprobs_close diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py new file mode 100644 index 00000000000..880967b4aed --- /dev/null +++ b/tests/models/language/generation/test_hybrid.py @@ -0,0 +1,315 @@ +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from tests.utils import multi_gpu_test +from vllm.engine.arg_utils import EngineArgs +from vllm.sampling_params import SamplingParams + +from ...utils import check_logprobs_close, check_outputs_equal + +# NOTE: The first model in each list is taken as the primary model, +# meaning that it will be used in all tests in this file +# The rest of the models will only be tested by test_models + +SSM_MODELS = [ + "state-spaces/mamba-130m-hf", + "tiiuae/falcon-mamba-tiny-dev", + # TODO: Compare to a Mamba2 model. The HF transformers implementation of + # Mamba2 is buggy for Codestral as it doesn't handle n_groups. + # See https://github.com/huggingface/transformers/pull/35943 + # "mistralai/Mamba-Codestral-7B-v0.1", +] + +HYBRID_MODELS = [ + "ai21labs/Jamba-tiny-dev", + # NOTE: Running Plamo2 in transformers implementation requires to install + # causal-conv1d package, which is not listed as a test dependency as it's + # not compatible with pip-compile. 
+ "pfnet/plamo-2-1b", + "Zyphra/Zamba2-1.2B-instruct", + "hmellor/bamba-tiny-random", +] + +# Avoid OOM +MAX_NUM_SEQS = 4 + + +@pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS) +@pytest.mark.parametrize("max_tokens", [64]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + max_tokens: int, + num_logprobs: int, +) -> None: + with hf_runner(model) as hf_model: + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) + + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS) +@pytest.mark.parametrize("max_tokens", [64]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_batching( + vllm_runner, + example_prompts, + model: str, + max_tokens: int, + num_logprobs: int, +) -> None: + for_loop_outputs = [] + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: + for prompt in example_prompts: + single_output, = vllm_model.generate_greedy_logprobs([prompt], + max_tokens, + num_logprobs) + for_loop_outputs.append(single_output) + + batched_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + check_logprobs_close( + outputs_0_lst=for_loop_outputs, + outputs_1_lst=batched_outputs, + name_0="for_loop_vllm", + name_1="batched_vllm", + ) + + +@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16]) +def test_chunked_prefill( + vllm_runner, + example_prompts, + model: str, + max_tokens: int, + num_logprobs: int, + chunked_prefill_token_size: int, +) -> None: + max_num_seqs = chunked_prefill_token_size + max_num_batched_tokens = chunked_prefill_token_size + + with vllm_runner(model, + enable_chunked_prefill=True, + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_seqs) as vllm_model: + chunked = vllm_model.generate_greedy_logprobs(example_prompts, + max_tokens, num_logprobs) + + with vllm_runner(model, + enable_chunked_prefill=False, + max_num_seqs=max_num_seqs) as vllm_model: + non_chunked = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + check_logprobs_close( + outputs_0_lst=chunked, + outputs_1_lst=non_chunked, + name_0="chunked", + name_1="non_chunked", + ) + + +@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) +@pytest.mark.parametrize("max_tokens", [10]) +def test_chunked_prefill_with_parallel_sampling( + vllm_runner, + example_prompts, + model: str, + max_tokens: int, +) -> None: + """ + Tests chunked prefill in conjunction with n > 1. + + In this case, prefill is populated with decoding tokens and + we test that it doesn't fail. 
+ + This test might fail if cache is not allocated correctly for n > 1 + decoding steps inside a chunked prefill forward pass + (where we have both prefill and decode together) + """ + sampling_params = SamplingParams(n=3, + temperature=1, + seed=0, + max_tokens=max_tokens) + with vllm_runner( + model, + enable_chunked_prefill=True, + # forces prefill chunks with decoding + max_num_batched_tokens=MAX_NUM_SEQS * 3, + max_num_seqs=MAX_NUM_SEQS, + ) as vllm_model: + vllm_model.generate(example_prompts, sampling_params) + + +@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) +@pytest.mark.parametrize("max_tokens", [20]) +def test_mamba_cache_cg_padding( + vllm_runner, + example_prompts, + model: str, + max_tokens: int, +) -> None: + """ + This test is for verifying that mamba cache is padded to CG captured + batch size. If it's not, a torch RuntimeError will be raised because + tensor dimensions aren't compatible. + """ + vllm_config = EngineArgs(model=model, + trust_remote_code=True).create_engine_config() + while len(example_prompts) == vllm_config.pad_for_cudagraph( + len(example_prompts)): + example_prompts.append(example_prompts[0]) + + try: + with vllm_runner(model) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) + except RuntimeError: + pytest.fail( + "Couldn't run batch size which is not equal to a Cuda Graph " + "captured batch size. " + "Could be related to mamba cache not padded correctly") + + +@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) +@pytest.mark.parametrize("max_tokens", [20]) +def test_models_preemption_recompute( + vllm_runner, + example_prompts, + model: str, + max_tokens: int, +) -> None: + """ + Tests that outputs are identical with and w/o preemptions (recompute). + """ + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: + scheduler = vllm_model.model.llm_engine.scheduler[0] + scheduler.ENABLE_ARTIFICIAL_PREEMPT = True + preempt_vllm_outputs = vllm_model.generate_greedy( + example_prompts, max_tokens) + + scheduler.ENABLE_ARTIFICIAL_PREEMPT = False + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=preempt_vllm_outputs, + outputs_1_lst=vllm_outputs, + name_0="vllm_preepmtions", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) +def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks( + vllm_runner, + example_prompts, + model: str, +) -> None: + """ + This test is for verifying that the hybrid inner state management doesn't + collapse in case where the number of incoming requests and + finished_requests_ids is larger than the maximum mamba block capacity. + + This could generally happen due to the fact that hybrid does support + statelessness mechanism where it can cleanup new incoming requests in + a single step. + """ + try: + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: + vllm_model.generate_greedy([example_prompts[0]] * 100, 10) + except ValueError: + pytest.fail("Hybrid inner state wasn't cleaned up properly between" + "steps finished requests registered unnecessarily ") + + +@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) +def test_state_cleanup( + vllm_runner, + example_prompts, + model: str, +) -> None: + """ + This test is for verifying that the Hybrid state is cleaned up between + steps. + + If its not cleaned, an error would be expected. 
+ """ + try: + with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: + for _ in range(10): + vllm_model.generate_greedy([example_prompts[0]] * 100, 1) + except ValueError: + pytest.fail("Hybrid inner state wasn't cleaned up between states, " + "could be related to finished_requests_ids") + + +@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) +@pytest.mark.parametrize("max_tokens", [64]) +def test_multistep_correctness( + vllm_runner, + example_prompts, + model: str, + max_tokens: int, +) -> None: + with vllm_runner(model, num_scheduler_steps=8, + max_num_seqs=2) as vllm_model: + vllm_outputs_multistep = vllm_model.generate_greedy( + example_prompts, max_tokens) + + with vllm_runner(model, num_scheduler_steps=1, + max_num_seqs=2) as vllm_model: + vllm_outputs_single_step = vllm_model.generate_greedy( + example_prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=vllm_outputs_multistep, + outputs_1_lst=vllm_outputs_single_step, + name_0="vllm_outputs_multistep", + name_1="vllm_outputs_single_step", + ) + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) +@pytest.mark.parametrize("max_tokens", [64]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_distributed_correctness( + vllm_runner, + example_prompts, + model: str, + max_tokens: int, + num_logprobs: int, +) -> None: + with vllm_runner(model, tensor_parallel_size=1, + max_num_seqs=2) as vllm_model: + vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + with vllm_runner(model, tensor_parallel_size=2, + max_num_seqs=2) as vllm_model: + vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + check_logprobs_close( + outputs_0_lst=vllm_outputs_tp_1, + outputs_1_lst=vllm_outputs_tp_2, + name_0="vllm_tp_1", + name_1="vllm_tp_2", + ) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/language/generation/test_mistral.py similarity index 86% rename from tests/models/decoder_only/language/test_mistral.py rename to tests/models/language/generation/test_mistral.py index ec885386dd9..c1b612ae213 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/language/generation/test_mistral.py @@ -1,8 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling. - -Run `pytest tests/models/test_mistral.py`. 
-""" import copy import json @@ -10,8 +6,8 @@ import jsonschema.exceptions import pytest -from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( # noqa - MistralToolParser) +from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( + MistralToolCall, MistralToolParser) from vllm.sampling_params import GuidedDecodingParams, SamplingParams from ...utils import check_logprobs_close @@ -194,7 +190,6 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, ) -@pytest.mark.skip("RE-ENABLE: test is currently failing on main.") @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @@ -246,10 +241,8 @@ def test_mistral_symbolic_languages(vllm_runner, model: str, assert "�" not in outputs[0].outputs[0].text.strip() -@pytest.mark.skip("RE-ENABLE: test is currently failing on main.") +@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("model", - MISTRAL_FORMAT_MODELS) # v1 can't do func calling def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None: with vllm_runner(model, dtype=dtype, @@ -270,7 +263,8 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None: parsed_message = tool_parser.extract_tool_calls(model_output, None) assert parsed_message.tools_called - assert parsed_message.tool_calls[0].id == "0UAqFzWsD" + + assert MistralToolCall.is_valid_id(parsed_message.tool_calls[0].id) assert parsed_message.tool_calls[ 0].function.name == "get_current_weather" assert parsed_message.tool_calls[ @@ -281,28 +275,38 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None: @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("guided_backend", ["outlines", "lm-format-enforcer", "xgrammar"]) -def test_mistral_guided_decoding(vllm_runner, model: str, - guided_backend: str) -> None: - with vllm_runner(model, dtype='bfloat16', - tokenizer_mode="mistral") as vllm_model: +def test_mistral_guided_decoding( + monkeypatch: pytest.MonkeyPatch, + vllm_runner, + model: str, + guided_backend: str, +) -> None: + with monkeypatch.context() as m: + # Guided JSON not supported in xgrammar + V1 yet + m.setenv("VLLM_USE_V1", "0") - guided_decoding = GuidedDecodingParams(json=SAMPLE_JSON_SCHEMA, - backend=guided_backend) - params = SamplingParams(max_tokens=512, - temperature=0.7, - guided_decoding=guided_decoding) + with vllm_runner( + model, + dtype='bfloat16', + tokenizer_mode="mistral", + guided_decoding_backend=guided_backend, + ) as vllm_model: + guided_decoding = GuidedDecodingParams(json=SAMPLE_JSON_SCHEMA) + params = SamplingParams(max_tokens=512, + temperature=0.7, + guided_decoding=guided_decoding) - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {SAMPLE_JSON_SCHEMA}" - }] - outputs = vllm_model.model.chat(messages, sampling_params=params) + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": + "user", + "content": + f"Give an example JSON for an employee profile that " + f"fits this schema: {SAMPLE_JSON_SCHEMA}" + }] + outputs = vllm_model.model.chat(messages, sampling_params=params) generated_text = outputs[0].outputs[0].text json_response = json.loads(generated_text) diff --git 
a/tests/models/decoder_only/language/test_phimoe.py b/tests/models/language/generation/test_phimoe.py similarity index 96% rename from tests/models/decoder_only/language/test_phimoe.py rename to tests/models/language/generation/test_phimoe.py index f9757d6ac29..603ca1cb12a 100644 --- a/tests/models/decoder_only/language/test_phimoe.py +++ b/tests/models/language/generation/test_phimoe.py @@ -1,8 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -"""Compare the outputs of HF and vLLM for moe models using greedy sampling. - -Run `pytest tests/models/test_phimoe.py`. -""" import pytest import torch diff --git a/tests/models/decoder_only/language/__init__.py b/tests/models/language/pooling/__init__.py similarity index 100% rename from tests/models/decoder_only/language/__init__.py rename to tests/models/language/pooling/__init__.py diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/language/pooling/test_classification.py similarity index 91% rename from tests/models/embedding/language/test_cls_models.py rename to tests/models/language/pooling/test_classification.py index 6a3cd8a5c59..44af3df08a8 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/language/pooling/test_classification.py @@ -1,8 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -"""Compare the classification outputs of HF and vLLM models. - -Run `pytest tests/models/test_cls_models.py`. -""" import pytest import torch from transformers import AutoModelForSequenceClassification @@ -19,7 +15,7 @@ ) @pytest.mark.parametrize("dtype", ["half"] if current_platform.is_rocm() else ["float"]) -def test_classification_models( +def test_models( hf_runner, vllm_runner, example_prompts, diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/language/pooling/test_embedding.py similarity index 94% rename from tests/models/embedding/language/test_embedding.py rename to tests/models/language/pooling/test_embedding.py index 5deb35fa321..9db385e77bd 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -1,14 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 -"""Compare the embedding outputs of HF and vLLM models. - -Run `pytest tests/models/embedding/language/test_embedding.py`. -""" import pytest from vllm.config import PoolerConfig from vllm.platforms import current_platform -from ..utils import check_embeddings_close +from ...utils import check_embeddings_close @pytest.mark.parametrize( diff --git a/tests/models/embedding/language/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py similarity index 64% rename from tests/models/embedding/language/test_gritlm.py rename to tests/models/language/pooling/test_gritlm.py index 87a1dde9381..3ad6e719094 100644 --- a/tests/models/embedding/language/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -7,11 +7,10 @@ import openai import pytest -import pytest_asyncio from scipy.spatial.distance import cosine -import vllm -import vllm.config +from vllm import LLM, SamplingParams +from vllm.config import ModelConfig from vllm.utils import STR_BACKEND_ENV_VAR from ....utils import RemoteOpenAIServer @@ -31,73 +30,45 @@ def _arr(arr): return array("i", arr) -def test_find_array(monkeypatch: pytest.MonkeyPatch): - # GritLM embedding implementation is only supported by XFormers backend. 
- with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - - from vllm.model_executor.models.gritlm import GritLMPooler - - # Create an LLM object to get the model config. - llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) - pooler = GritLMPooler(model_config=llm.llm_engine.model_config) - - arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) - - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1 - assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1 - - with pytest.raises(ValueError): - pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1) - - -@pytest.fixture(scope="module") -def server_embedding(): - # GritLM embedding implementation is only supported by XFormers backend. - args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)] - with pytest.MonkeyPatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server - - -@pytest.fixture(scope="module") -def server_generate(): - args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)] - with pytest.MonkeyPatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server +def test_find_array(): + from vllm.model_executor.models.gritlm import GritLMPooler + model_config = ModelConfig( + MODEL_NAME, + task="embed", + tokenizer=MODEL_NAME, + tokenizer_mode="auto", + trust_remote_code=False, + dtype="bfloat16", + seed=0, + ) + pooler = GritLMPooler(model_config=model_config) -@pytest_asyncio.fixture -async def client_embedding(server_embedding: RemoteOpenAIServer): - async with server_embedding.get_async_client() as async_client: - yield async_client + arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1 + assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1 -@pytest_asyncio.fixture -async def client_generate(server_generate: RemoteOpenAIServer): - async with server_generate.get_async_client() as async_client: - yield async_client + with pytest.raises(ValueError): + pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1) def run_llm_encode( - llm: vllm.LLM, + llm: LLM, queries: list[str], instruction: str, -) -> list[float]: - outputs = llm.encode([instruction + q for q in queries], ) +) -> list[list[float]]: + outputs = llm.embed([instruction + q for q in queries]) return [output.outputs.embedding for output in outputs] async def run_client_embeddings( - client: vllm.LLM, + client: openai.AsyncOpenAI, queries: list[str], instruction: str, -) -> list[float]: +) -> list[list[float]]: outputs = await client.embeddings.create( model=MODEL_NAME, input=[instruction + q for q in queries], @@ -132,7 +103,7 @@ def get_test_data(): return queries, q_instruction, documents, d_instruction -def validate_embed_output(q_rep: list[float], d_rep: list[float]): +def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]): cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0]) assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001) @@ -143,70 +114,100 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]): assert math.isclose(cosine_sim_q1_d0, 0.120, 
abs_tol=0.001) cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1]) - assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001) + assert math.isclose(cosine_sim_q1_d1, 0.534, abs_tol=0.001) -def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch): +def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch, + vllm_runner): # GritLM embedding implementation is only supported by XFormers backend. with monkeypatch.context() as m: m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") queries, q_instruction, documents, d_instruction = get_test_data() - llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) + with vllm_runner( + MODEL_NAME, + task="embed", + max_model_len=MAX_MODEL_LEN, + ) as vllm_model: + llm = vllm_model.model + + d_rep = run_llm_encode( + llm, + documents, + d_instruction, + ) + q_rep = run_llm_encode( + llm, + queries, + q_instruction, + ) + + validate_embed_output(q_rep, d_rep) + + +@pytest.mark.asyncio +async def test_gritlm_api_server_embedding(): + queries, q_instruction, documents, d_instruction = get_test_data() + + # GritLM embedding implementation is only supported by XFormers backend. + args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)] + env_dict = {STR_BACKEND_ENV_VAR: "XFORMERS"} + + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server: + client_embedding = server.get_async_client() - d_rep = run_llm_encode( - llm, + d_rep = await run_client_embeddings( + client_embedding, documents, d_instruction, ) - q_rep = run_llm_encode( - llm, + q_rep = await run_client_embeddings( + client_embedding, queries, q_instruction, ) - validate_embed_output(q_rep, d_rep) - - -@pytest.mark.asyncio -async def test_gritlm_api_server_embedding( - client_embedding: openai.AsyncOpenAI, ): - queries, q_instruction, documents, d_instruction = get_test_data() + validate_embed_output(q_rep, d_rep) - d_rep = await run_client_embeddings( - client_embedding, - documents, - d_instruction, - ) - q_rep = await run_client_embeddings( - client_embedding, - queries, - q_instruction, - ) - validate_embed_output(q_rep, d_rep) +def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner): + # GritLM embedding implementation is only supported by XFormers backend. + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") + input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n" -def test_gritlm_offline_gen(): - input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n" + with vllm_runner( + MODEL_NAME, + task="generate", + max_model_len=MAX_MODEL_LEN, + ) as vllm_model: + llm = vllm_model.model - llm = vllm.LLM(MODEL_NAME, max_model_len=MAX_MODEL_LEN) - sampling_params = vllm.SamplingParams(temperature=0.0, max_tokens=256) - outputs = llm.generate(input, sampling_params=sampling_params) + sampling_params = SamplingParams(temperature=0.0, max_tokens=256) + outputs = llm.generate(input, sampling_params=sampling_params) - assert outputs[0].outputs[0].text == "The capital of France is Paris." + assert outputs[0].outputs[0].text == "The capital of France is Paris." 
@pytest.mark.asyncio -async def test_gritlm_api_server_gen(client_generate: openai.AsyncOpenAI): +async def test_gritlm_api_server_generate(): input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n" - outputs = await client_generate.completions.create( - model=MODEL_NAME, - prompt=input, - max_tokens=256, - temperature=0.0, - ) + # GritLM embedding implementation is only supported by XFormers backend. + args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)] + env_dict = {"VLLM_USE_V1": "0", STR_BACKEND_ENV_VAR: "XFORMERS"} + + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server: + client_generate = server.get_async_client() + + outputs = await client_generate.completions.create( + model=MODEL_NAME, + prompt=input, + max_tokens=256, + temperature=0.0, + ) assert outputs.choices[0].text == "The capital of France is Paris." diff --git a/tests/models/embedding/language/test_jina.py b/tests/models/language/pooling/test_jina.py similarity index 82% rename from tests/models/embedding/language/test_jina.py rename to tests/models/language/pooling/test_jina.py index 881d0a75b15..5287ca37c0f 100644 --- a/tests/models/embedding/language/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -1,16 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 -# ruff: noqa: E501 -"""Compare the scoring outputs of HF and vLLM models. - -Run `pytest tests/models/embedding/language/test_jina.py`. -""" import math import pytest -from tests.models.embedding.utils import check_embeddings_close, matryoshka_fy from vllm import PoolingParams +from ...utils import check_embeddings_close, matryoshka_fy + SCORING_MODELS = [ "jinaai/jina-reranker-v2-base-multilingual", # Roberta ] @@ -21,9 +17,9 @@ "Organic skincare for sensitive skin with aloe vera and chamomile.", "New makeup trends focus on bold colors and innovative techniques", "Bio-Hautpflege für empfindliche Haut mit Aloe Vera und Kamille", - "Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken", - "Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla", - "Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras", + "Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken", # noqa: E501 + "Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla", # noqa: E501 + "Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras", # noqa: E501 "针对敏感肌专门设计的天然有机护肤产品", "新的化妆趋势注重鲜艳的颜色和创新的技巧", "敏感肌のために特別に設計された天然有機スキンケア製品", @@ -153,14 +149,24 @@ def test_matryoshka( with vllm_runner(model, task="embed", dtype=dtype, max_model_len=None) as vllm_model: - vllm_outputs = vllm_model.encode( - example_prompts, - pooling_params=PoolingParams(dimensions=dimensions)) - - check_embeddings_close( - embeddings_0_lst=hf_outputs, - embeddings_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - tol=1e-2, - ) + matryoshka_dimensions = ( + vllm_model.model.llm_engine.model_config.matryoshka_dimensions) + assert matryoshka_dimensions is not None + + if dimensions not in matryoshka_dimensions: + with pytest.raises(ValueError): + vllm_model.encode( + example_prompts, + pooling_params=PoolingParams(dimensions=dimensions)) + else: + vllm_outputs = vllm_model.encode( + example_prompts, + pooling_params=PoolingParams(dimensions=dimensions)) + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + tol=1e-2, + ) diff --git 
a/tests/models/embedding/language/test_scoring.py b/tests/models/language/pooling/test_scoring.py similarity index 72% rename from tests/models/embedding/language/test_scoring.py rename to tests/models/language/pooling/test_scoring.py index d6408258ffc..e9527700c3c 100644 --- a/tests/models/embedding/language/test_scoring.py +++ b/tests/models/language/pooling/test_scoring.py @@ -1,15 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 -"""Compare the scoring outputs of HF and vLLM models. - -Run `pytest tests/models/embedding/language/test_scoring.py`. -""" import math import pytest import torch import torch.nn.functional as F -MODELS = [ +CROSS_ENCODER_MODELS = [ "cross-encoder/ms-marco-MiniLM-L-6-v2", # Bert "BAAI/bge-reranker-v2-m3", # Roberta ] @@ -28,21 +24,21 @@ "The capital of Germany is Berlin.", ] +DTYPE = "half" + -@pytest.fixture(scope="module", params=MODELS) +@pytest.fixture(scope="module", params=CROSS_ENCODER_MODELS) def model_name(request): yield request.param -@pytest.mark.parametrize("dtype", ["half"]) -def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str): - +def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name): text_pair = [TEXTS_1[0], TEXTS_2[0]] - with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: + with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model: hf_outputs = hf_model.predict([text_pair]).tolist() - with vllm_runner(model_name, task="score", dtype=dtype, + with vllm_runner(model_name, task="score", dtype=DTYPE, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) @@ -52,18 +48,16 @@ def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str): assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) -@pytest.mark.parametrize("dtype", ["half"]) -def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str): - +def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name): text_pairs = [ [TEXTS_1[0], TEXTS_2[0]], [TEXTS_1[0], TEXTS_2[1]], ] - with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: + with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model: hf_outputs = hf_model.predict(text_pairs).tolist() - with vllm_runner(model_name, task="score", dtype=dtype, + with vllm_runner(model_name, task="score", dtype=DTYPE, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) @@ -74,18 +68,16 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str): assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) -@pytest.mark.parametrize("dtype", ["half"]) -def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str): - +def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name): text_pairs = [ [TEXTS_1[0], TEXTS_2[0]], [TEXTS_1[1], TEXTS_2[1]], ] - with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: + with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model: hf_outputs = hf_model.predict(text_pairs).tolist() - with vllm_runner(model_name, task="score", dtype=dtype, + with vllm_runner(model_name, task="score", dtype=DTYPE, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) @@ -101,13 +93,10 @@ def emb_model_name(request): yield request.param -@pytest.mark.parametrize("dtype", ["half"]) -def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name, - dtype: str): - +def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name): text_pair = 
[TEXTS_1[0], TEXTS_2[0]] - with hf_runner(emb_model_name, dtype=dtype, + with hf_runner(emb_model_name, dtype=DTYPE, is_sentence_transformer=True) as hf_model: hf_embeddings = hf_model.encode(text_pair) hf_outputs = [ @@ -116,7 +105,7 @@ def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name, with vllm_runner(emb_model_name, task="embed", - dtype=dtype, + dtype=DTYPE, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) @@ -126,16 +115,13 @@ def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name, assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) -@pytest.mark.parametrize("dtype", ["half"]) -def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name, - dtype: str): - +def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name): text_pairs = [ [TEXTS_1[0], TEXTS_2[0]], [TEXTS_1[0], TEXTS_2[1]], ] - with hf_runner(emb_model_name, dtype=dtype, + with hf_runner(emb_model_name, dtype=DTYPE, is_sentence_transformer=True) as hf_model: hf_embeddings = [ hf_model.encode(text_pair) for text_pair in text_pairs @@ -147,7 +133,7 @@ def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name, with vllm_runner(emb_model_name, task="embed", - dtype=dtype, + dtype=DTYPE, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) @@ -158,16 +144,13 @@ def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name, assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) -@pytest.mark.parametrize("dtype", ["half"]) -def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name, - dtype: str): - +def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name): text_pairs = [ [TEXTS_1[0], TEXTS_2[0]], [TEXTS_1[1], TEXTS_2[1]], ] - with hf_runner(emb_model_name, dtype=dtype, + with hf_runner(emb_model_name, dtype=DTYPE, is_sentence_transformer=True) as hf_model: hf_embeddings = [ hf_model.encode(text_pair) for text_pair in text_pairs @@ -179,7 +162,7 @@ def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name, with vllm_runner(emb_model_name, task="embed", - dtype=dtype, + dtype=DTYPE, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py new file mode 100644 index 00000000000..c050b35b76b --- /dev/null +++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py @@ -0,0 +1,95 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest + +from ...utils import EmbedModelInfo, check_embeddings_close + +EMBEDDING_PROMPTS = [ + 'what is snowflake?', 'Where can I get the best tacos?', 'The Data Cloud!', + 'Mexico City of Course!' 
+] + +MODELS = [ + EmbedModelInfo("Snowflake/snowflake-arctic-embed-xs", + is_matryoshka=False, + architecture="BertModel", + enable_test=True), + EmbedModelInfo("Snowflake/snowflake-arctic-embed-s", + is_matryoshka=False, + architecture="BertModel", + enable_test=False), + EmbedModelInfo("Snowflake/snowflake-arctic-embed-m", + is_matryoshka=False, + architecture="BertModel", + enable_test=False), + EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long", + is_matryoshka=False, + architecture="NomicBertModel", + enable_test=True), + EmbedModelInfo("Snowflake/snowflake-arctic-embed-l", + is_matryoshka=False, + architecture="BertModel", + enable_test=False), + EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5", + is_matryoshka=True, + architecture="BertModel", + enable_test=True), + EmbedModelInfo("Snowflake/snowflake-arctic-embed-l-v2.0", + is_matryoshka=True, + architecture="XLMRobertaModel", + enable_test=True), + EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0", + is_matryoshka=True, + architecture="GteModel", + enable_test=True), +] + + +@pytest.mark.parametrize("model_info", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +def test_models( + hf_runner, + vllm_runner, + example_prompts, + model_info: EmbedModelInfo, + dtype: str, + monkeypatch, +) -> None: + if not model_info.enable_test: + # A model family has many models with the same architecture, + # and we don't need to test each one. + pytest.skip("Skipping test.") + + example_prompts = example_prompts + EMBEDDING_PROMPTS + + vllm_extra_kwargs = { + "hf_overrides": { + "is_matryoshka": model_info.is_matryoshka + } + } + + with hf_runner(model_info.name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_outputs = hf_model.encode(example_prompts) + + with vllm_runner(model_info.name, + task="embed", + dtype=dtype, + max_model_len=None, + **vllm_extra_kwargs) as vllm_model: + + assert (vllm_model.model.llm_engine.model_config.is_matryoshka == + model_info.is_matryoshka) + + if model_info.architecture: + assert (model_info.architecture + in vllm_model.model.llm_engine.model_config.architectures) + + vllm_outputs = vllm_model.encode(example_prompts) + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + tol=1e-2, + ) diff --git a/tests/models/language/pooling/test_truncation_control.py b/tests/models/language/pooling/test_truncation_control.py new file mode 100644 index 00000000000..1b8ac395ed1 --- /dev/null +++ b/tests/models/language/pooling/test_truncation_control.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest + +MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2" +max_model_len = 128 + +input_str = """Immerse yourself in the enchanting chronicle of calculus, a +mathematical domain that has radically transformed our comprehension of +change and motion. Despite its roots in ancient civilizations, the +formal birth of calculus predominantly occurred in the 17th century, +primarily under the influential guidance of Sir Isaac Newton and Gottfried +Wilhelm Leibniz. The earliest traces of calculus concepts are found in +ancient Greek mathematics,most notably in the works of Eudoxus and +Archimedes, around 300 BCE. They utilized the 'method of exhaustion'—a +technique for computing areas and volumes through the use of finite sums. +This methodology laid crucial foundational work for integral calculus. 
+In the 17th century, both Newton and Leibniz independently pioneered +calculus, each contributing unique perspectives that would shape this new +field.""" + + +def test_smaller_truncation_size(vllm_runner, + model_name=MODEL_NAME, + input_str=input_str): + + truncate_prompt_tokens = 10 + + with vllm_runner(model_name, task="embed", + max_model_len=max_model_len) as vllm_model: + vllm_output = vllm_model.model.encode( + input_str, truncate_prompt_tokens=truncate_prompt_tokens) + + prompt_tokens = vllm_output[0].prompt_token_ids + + assert len(prompt_tokens) == truncate_prompt_tokens + + +def test_max_truncation_size(vllm_runner, + model_name=MODEL_NAME, + input_str=input_str): + truncate_prompt_tokens = -1 + + with vllm_runner(model_name, task="embed", + max_model_len=max_model_len) as vllm_model: + vllm_output = vllm_model.model.encode( + input_str, truncate_prompt_tokens=truncate_prompt_tokens) + + prompt_tokens = vllm_output[0].prompt_token_ids + + assert len(prompt_tokens) == max_model_len + + +def test_bigger_truncation_size(vllm_runner, + model_name=MODEL_NAME, + input_str=input_str): + + truncate_prompt_tokens = max_model_len + 1 + + with pytest.raises(ValueError), vllm_runner( + model_name, task="embed", + max_model_len=max_model_len) as vllm_model: + + llm_output = vllm_model.model.encode( + input_str, truncate_prompt_tokens=truncate_prompt_tokens) + + assert llm_output == f"""truncate_prompt_tokens value + ({truncate_prompt_tokens}) is greater than + max_model_len ({max_model_len}). Please, select + a smaller truncation size.""" diff --git a/tests/models/decoder_only/vision_language/__init__.py b/tests/models/multimodal/generation/__init__.py similarity index 100% rename from tests/models/decoder_only/vision_language/__init__.py rename to tests/models/multimodal/generation/__init__.py diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/multimodal/generation/test_common.py similarity index 91% rename from tests/models/decoder_only/vision_language/test_models.py rename to tests/models/multimodal/generation/test_common.py index 5c87cefcd8e..6e915a9f600 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/multimodal/generation/test_common.py @@ -8,13 +8,14 @@ from pathlib import PosixPath import pytest -from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq +from transformers import (AutoModelForImageTextToText, + AutoModelForTextToWaveform, AutoModelForVision2Seq) from vllm.platforms import current_platform from vllm.utils import identity -from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets, - _VideoAssets) +from ....conftest import (IMAGE_ASSETS, HfRunner, ImageTestAssets, + VideoTestAssets, VllmRunner) from ....utils import (create_new_process_for_each_test, large_gpu_mark, multi_gpu_marks) from ...utils import check_outputs_equal @@ -139,6 +140,24 @@ image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), + "qwen2_5_omni": VLMTestInfo( + models=["Qwen/Qwen2.5-Omni-3B"], + test_type=( + VLMTestType.IMAGE, + VLMTestType.MULTI_IMAGE, + VLMTestType.VIDEO + ), + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501 + video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + 
auto_cls=AutoModelForTextToWaveform, + vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, + patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner, + image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), #### Extended model tests "aria": VLMTestInfo( models=["rhymes-ai/Aria"], @@ -250,6 +269,7 @@ multi_image_prompt="Describe the two images in detail.", # noqa: E501 max_model_len=4096, max_num_seqs=2, + dtype="bfloat16", auto_cls=AutoModelForImageTextToText, vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}}, patch_hf_runner=model_utils.gemma3_patch_hf_runner, @@ -406,6 +426,8 @@ get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id], hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner, + # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55 + marks=[pytest.mark.skip("HF import fails")], ), "minicpmo_26": VLMTestInfo( models=["openbmb/MiniCPM-o-2_6"], @@ -417,6 +439,8 @@ get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner, + # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55 + marks=[pytest.mark.skip("HF import fails")], ), "minicpmv_26": VLMTestInfo( models=["openbmb/MiniCPM-V-2_6"], @@ -428,6 +452,21 @@ get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner, + # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55 + marks=[pytest.mark.skip("HF import fails")], + ), + "minimax_vl_01": VLMTestInfo( + models=["MiniMaxAI/MiniMax-VL-01"], + prompt_formatter=lambda img_prompt: f"user: {img_prompt} assistant:", # noqa: E501 + img_idx_to_prompt=lambda _: "", + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + max_model_len=8192, + max_num_seqs=4, + dtype="bfloat16", + hf_output_post_proc=model_utils.minimax_vl_01_hf_output, + patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner, + auto_cls=AutoModelForImageTextToText, + marks=[large_gpu_mark(min_gb=80)], ), "molmo": VLMTestInfo( models=["allenai/Molmo-7B-D-0924"], @@ -437,6 +476,18 @@ max_num_seqs=2, patch_hf_runner=model_utils.molmo_patch_hf_runner, ), + "ovis2": VLMTestInfo( + models=["AIDC-AI/Ovis2-1B"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "\n", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + dtype="half", + # use sdpa mode for hf runner since ovis2 didn't work with flash_attn + hf_model_kwargs={"llm_attn_implementation": "sdpa"}, + patch_hf_runner=model_utils.ovis2_patch_hf_runner, + ), "phi3v": VLMTestInfo( models=["microsoft/Phi-3.5-vision-instruct"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), @@ -642,7 +693,7 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets, monkeypatch): + image_assets: ImageTestAssets, monkeypatch): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") 
model_test_info = VLM_TEST_SETTINGS[model_type] @@ -667,7 +718,7 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets, monkeypatch): + image_assets: ImageTestAssets, monkeypatch): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -692,7 +743,7 @@ def test_image_embedding_models(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets, monkeypatch): + image_assets: ImageTestAssets, monkeypatch): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -714,7 +765,7 @@ def test_image_embedding_models(model_type: str, )) def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - video_assets: _VideoAssets, monkeypatch): + video_assets: VideoTestAssets, monkeypatch): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -765,7 +816,7 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets, monkeypatch): + image_assets: ImageTestAssets, monkeypatch): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -791,7 +842,7 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets, monkeypatch): + image_assets: ImageTestAssets, monkeypatch): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -817,7 +868,8 @@ def test_image_embedding_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets, monkeypatch): + image_assets: ImageTestAssets, + monkeypatch): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -840,7 +892,7 @@ def test_image_embedding_models_heavy(model_type: str, def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - video_assets: _VideoAssets, monkeypatch): + video_assets: VideoTestAssets, monkeypatch): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] diff --git a/tests/models/encoder_decoder/vision_language/test_florence2.py b/tests/models/multimodal/generation/test_florence2.py similarity index 87% rename from tests/models/encoder_decoder/vision_language/test_florence2.py rename to tests/models/multimodal/generation/test_florence2.py index a6ec333e2e9..b8225f5f124 100644 --- a/tests/models/encoder_decoder/vision_language/test_florence2.py +++ b/tests/models/multimodal/generation/test_florence2.py @@ -9,16 +9,16 @@ from vllm.multimodal.image import rescale_image_size from vllm.sequence import SampleLogprobs -from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets +from ....conftest import IMAGE_ASSETS, HfRunner, ImageTestAssets, 
VllmRunner from ...utils import check_logprobs_close MODELS = ["microsoft/Florence-2-base"] -# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer -# Therefore, we borrow the BartTokenizer from the original Bart model -TOKENIZER = "facebook/bart-base" +# Florence-2 model repo's tokenizer config is missing some special tokens. +# Therefore, we use a converted tokenizer from a forked repo +TOKENIZER = "Isotr0py/Florence-2-tokenizer" HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ "stop_sign": - "", # special task token + "", # special task token which will output special tokens "cherry_blossom": "Describe in detail what is shown in the image.", }) @@ -45,7 +45,6 @@ def hf_to_vllm_output(hf_output: tuple[list[int], str, output_ids, output_str, out_logprobs = hf_output output_str = output_str.replace("", "").replace("", "") - output_ids = [ids for ids in output_ids if ids not in [0, 2]] return output_ids, output_str, out_logprobs @@ -71,8 +70,11 @@ def run_test( enforce_eager=True) as vllm_model: vllm_outputs_per_case = [ vllm_model.generate_encoder_decoder_greedy_logprobs( - prompts, max_tokens, num_logprobs=num_logprobs) - for prompts in inputs + prompts, + max_tokens, + num_logprobs=num_logprobs, + skip_special_tokens=False, + ) for prompts in inputs ] hf_inputs = [get_hf_images_prompts(prompts) for prompts in inputs] @@ -93,6 +95,7 @@ def run_test( outputs_1_lst=vllm_outputs, name_0="hf", name_1="vllm", + num_outputs_0_skip_tokens=1, ) @@ -115,7 +118,7 @@ def run_test( @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) def test_models(hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets, model: str, + image_assets: ImageTestAssets, model: str, size_factors: list[int], dtype: str, max_tokens: int, num_logprobs: int) -> None: images = [asset.pil_image for asset in image_assets] diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py new file mode 100644 index 00000000000..96c444441e3 --- /dev/null +++ b/tests/models/multimodal/generation/test_granite_speech.py @@ -0,0 +1,144 @@ +# SPDX-License-Identifier: Apache-2.0 + +from collections.abc import Sequence +from typing import Optional + +import pytest +from transformers import AutoModelForSpeechSeq2Seq + +from vllm.lora.request import LoRARequest +from vllm.sequence import SampleLogprobs + +from ....conftest import (AudioTestAssets, HfRunner, PromptAudioInput, + VllmRunner) +from ...registry import HF_EXAMPLE_MODELS +from ...utils import check_logprobs_close + +HF_AUDIO_PROMPT = "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|><|audio|>can you transcribe the speech into a written format?<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>" # noqa: E501 + + +def vllm_to_hf_output( + vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], +) -> tuple[list[int], str, Optional[SampleLogprobs]]: + """Sanitize hf output to be comparable with vllm output.""" + output_ids, output_str, out_logprobs = vllm_output + + hf_output_str = output_str + "<|end_of_text|>" + + return output_ids, hf_output_str, out_logprobs + + +MODEL_NAME = "ibm-granite/granite-speech-3.3-8b" +# Audio lora co-exists directly in the model directory, but +# currently still needs to be passed directly to vLLM. 
+audio_lora_path = MODEL_NAME +models = [MODEL_NAME] + + +def run_test( + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + inputs: Sequence[tuple[list[str], PromptAudioInput]], + model: str, + *, + max_model_len: int, + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Inference result should be the same between hf and vllm. + + All the audio fixtures for the test are from AUDIO_ASSETS. + For huggingface runner, we provide the audio as input. + For vllm runner, we provide MultiModalDataDict objects + and corresponding MultiModalConfig as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. + """ + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). + # max_model_len should be greater than image_feature_size + with vllm_runner( + model, + task="generate", + max_model_len=max_model_len, + max_num_seqs=1, + dtype=dtype, + limit_mm_per_prompt={"audio": 1}, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enable_lora=True, + max_lora_rank=64, + enforce_eager=True, + ) as vllm_model: + lora_request = LoRARequest("audio", 1, audio_lora_path) + vllm_outputs_per_case = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + audios=audios, + lora_request=lora_request) + for prompts, audios in inputs + ] + + with hf_runner(model, dtype=dtype, + auto_cls=AutoModelForSpeechSeq2Seq) as hf_model: + + hf_processor = hf_model.processor + eos_token_id = hf_processor.tokenizer.eos_token_id + + hf_outputs_per_case = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + audios=[audios], + eos_token_id=eos_token_id) + for prompts, audios in inputs + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, + vllm_outputs_per_case): + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(output) for output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_model_len", [2048]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_models(hf_runner, vllm_runner, model: str, + audio_assets: AudioTestAssets, dtype: str, max_model_len: int, + max_tokens: int, num_logprobs: int) -> None: + model_info = HF_EXAMPLE_MODELS.find_hf_info(model) + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") + + audio, sr = audio_assets[0].audio_and_sample_rate + # This model expects 16k sample rate, which our test audio + # already is; if this changes, it may break this test, + # so we check it directly + assert sr == 16000 + run_test( + hf_runner, + vllm_runner, + [ + ([HF_AUDIO_PROMPT], [audio]), + ], + model, + dtype=dtype, + max_model_len=max_model_len, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/tests/models/decoder_only/vision_language/test_interleaved.py b/tests/models/multimodal/generation/test_interleaved.py similarity index 96% rename from 
tests/models/decoder_only/vision_language/test_interleaved.py rename to tests/models/multimodal/generation/test_interleaved.py index 8804497ae61..eec84751e45 100644 --- a/tests/models/decoder_only/vision_language/test_interleaved.py +++ b/tests/models/multimodal/generation/test_interleaved.py @@ -16,6 +16,7 @@ def base_prompt(modalities_str: str) -> str: NONINTERLEAVED_PROMPT = base_prompt("

zT3lI3knnT_1kB4~#v$ye;3%M5x<4T;9APmgJi>&ZAh{VRA@GkS+kXVC=y$>!J4`Yv zU%V5V!6h%r`>b{iRANDA{&zxx1k%r6X}RJppf6-Sh0`8&Ry+^?|B2lcEG!^9QTZGe z#O>lA8LWkfxac0ZB-{J!$K`TJOtL{8bT1XxG{ZXb4rTFQI-UPsWS|DsKfcS~BKlN< zg2dX6u;a3s>;vg~3hn<0fSZFgJ7%yO4aO_0Kox;#7Q4GYOxZs$;{Wf}ODcw%rq>ym z{WiU%HJ)ZusY~;JMU{RhCKoJBrRwUd)=2Ac)g=p8M{^!SV5qyxLVE`_16wGh5g>Q)jiTbYU3v@kh;F~g5VuM6#`pQu? z)MnWgjZ~-aNhVsP!_=;yiu9$sz2Pk&T0?#_EwfwtZ(9IRCiNep#mRPJaiBKQi4Z&m z_F$Tt%CZ&~=F1!H!|ebjHuuM?}EwuES)MZ9X^FR+I4&l2Yr zWR}^B8-^_V1nq0>d+eoc&M(pu9{pNW;r;Jg@g{-?Uf%#GXqZHFVkx4BqInC!P#-8~ zIgF)jV!%8gh&C}XHmj-R*U-7Kck%;6EVE~9Y@86f+H6Gz$6~vh-KUYS$p&ekm(lVm zDb|2tXtG1Ud@$DR#ylSgPgG@LoK3tu|3&@|QyeN5MZ#>dGKL!{p+V>d2Gjj_3MMr` zdv2T($oREz0f_ej5fMcHwg8vLbh?tu>*AEFuMf=T?UkUE_u35Y;N1DKWf_IE~>jP1lu|PsH2t+1{E5&|?!!vD(%9>sK zf_4i}^Q;jRsbQ;TYYgf-E5)^ZS@;-d%9*KKT#?6rGF{_S`a+;s_!+J1#$g-v??a#u zf1oG;*GL^DC^GK)-;oW#;-|o5gpw`Fmp;M?2oobEes1s;ff_bL6mZ)jvih5p)$2t> z;PM(Rp06D1(xcf&jrx+DX>~$hV>#%`wBnu)F|}YD2B&A{W3pn3VzRd$IJojB5;Lr? zL~%z&aGupT;*IeIDeBZxsbNeictyO_-fwKiKwp|usN0-miKH!HX87a8)|f^6$puI| z+}-2(0Cn=Oxx?yFpE)2)yXL(Y}qY4aw{MLL(;8>zvvqRI?clIwk5M z3vl~J;%HvvK-lL=Gq3)lZX1)$>8m#;htdo$)+BHQ<0Ih(0cbC~{|bAaDh?~^5Ud+kr` zG%fxQOL)vUEQlmoV~_NliZmElvU8 zKsG`5B=A~^S|8fej8>}3uxdgY7gHrgAlR)0`})=Q&S;m|pD=dW3_yyZrprxCs%l&Ni4fI@!F45D4U9Jd=WhMLqUQ^kf zld9xysVId%K2TCbJ~D&><*=$1vYQ@v@^2=y_V67cJ&ki5EM4P&sR-FcZ;HJB6yV|B zrBpL#5_C@M7R+goNuW-^Zg&ex4WavNBlY&>)3uS<-%7ec?arUNU?#7m-Iy)&L`men zY{6mL1%6med|h9%<&{h#c2v@t71YNe#<;Wuwk`XenW4+RI9{!6UEOym*n)VLEe{U0 zqqhxKF~a`^@ZKkY`hV?~*8oD0?g~FQN>|y=hi~(`!05@p3m5$`Os>o@qp$W@?Gjid z_Iy)5{&+Q|j%;pjZzamN#pK{aizKdL z&C_SDEetDS?|KxBoRBQn5m?pMHt4!9hV4rT#xbZ6`|P&3>SPyCZhrLlZl&|aYo34E zn<}@T2~(o7UHWqsT!c51N_$XEe?#KU^Gcz1+>HDGN?HJGkq72A=HVtQh_)E*`+Jf; z^7A%Ik4bn56sKwIY}BXu2)G5g5N~N_=>$VbXun}IxK2E76JhHuzzXHl-WFMWr*`@( zvS9Vq$X3Yms}qgyY53|E0zVbgIL_YoPEi|Pb7zS*Pt?l@k^t*StCJ$*>HxaUB{A%? zV*=0p3&U_D$fJZHe)`|1nu7G(^I!H~t1x~?R{ARp$9B>X*#0~1KyEeTqT=DbG|d|A zc=)7p80XLCu#z)lc|x-@T7{pZsL`gRKr8Hg+85hHbeAh(EJ&dJ3g>m_GA8 zmH5sMn2sG`pbJ#I6|4K!8y^%ilflJ^JG&mKGs8@tmQRLSrR(FaT!quey8qG5d2!LRNhsd1dh% zqL-?0$2=9Wte0{xAum5P(FC^&2x7iut&E8g6U+}J*%%|3WAV4MppYQZrde7VOXM;B z;({r~VSJYmeWBx^%wq7-Biji7Buz0qm9jjsBnHR4JE}Gvg(^Gk{Lm!D6qTOHbmvEB+A>+ozZ;P!0-% z2jVVrjdPY6!dE3FM=)^7>e`p6{L$(*zI+6(dlBo3TJaAigw4Na8<^q z@M6WJIgLQLr0D)xO84)H_Dq0nY7q2h!{Ryb#%Iz0>n%VwH{)6~m&m1aBjny+DhF2)J`?QsUNUG4wR+glO zmBMzv(q9kQ$yT~hrQ0B)luchlt%*6Q8ai89X?I?!utvKOTCLiQ?1v)T_z~-WwHy8? 
z(!uTqyjmADKC3sUQ*M4yU1Rjj*-B6iQK>(!Wrl0CCf0^Qwb*L4GheA3r&0cS9gk4I z66&w!mW^t1{wmJo(Eak{&&JnJfhGk`2$It_*a->V_3kAswp>7z(wfezWs|>ru{jCL?V4oQqG^X1MX23E7&cFq7}2Edd;s)UR8Pj-yf}t z##q52-4kxym*v)K9{f){1+S!?-dl!6xqfk@iYTB+3@9KmFtjL0 zGju5_t)Qd~I?^%HC82=CFoZOypn#M}OA1Jb3eq*q0Ma?c0K<9Md+S#J`#m4dw{uN`Av$a^JGW@^A*l#qFI3(frlI6*%56F=2&CZkfa zK1qB3@0+E6v-*(j+8inLGs>wiL-v)(KkUC|Os1JE;@Q>-2dV^l2s7QM@XR#$88bHI za9o3vK{Gd~-|GHo0Muy#uq2y3tA54h>!~mK5vq(?UH_bF2eNvi0nc^EB}IR@bwG!_ z*mSB06wf6<&R!?q{5$_)EqTQrJAdN0|BklA3Sbg99mAx5#scbma9)GoEU?^Px6!hl zGsuL<{h<9f;*M~hP7vHc&gSP>H!cKxu|@@1=u5T?m4UavN4q?LZmhXns%N={yc_v* z$AO0?^mHWWbX@NDpJCyPVX_E*KuQ=9!+dmHQ!mq69xS*k9W0ptDe<;4C$qYsYOj z$-^Y}4Q)}X$zOF-{z!rYe;!|n2#Kb#j7&4Gbg*vkF_}{t&(*8)2zuP~vv3oKA|2&gegXGi69If}~VtP~S*(tBal8M*-p{^`WW z!e@Z$ByKl8qwZ@lBp3c<+CPTwFZ}X=Bmf1-<9R`DI;XqF#`=7NZf?3E*N7RS#y;KR zLSxg7#)kqpwdf#PaQH%3pG7>4r55(f)|{SqZDEzgw=LZ5)BoA84Tydi$l6>8717LA zELCW>R%%mdMzf9`m{u0snuJOyWF21==qfo)-TKucQn;l#wUKRYD6+MP>lt>BZoP`t z?6rScsSF5*d2?gb3mezUTT!91>ojh#@nYW9cw64ich~9*j`D(~w4&}Wjw7To^$NO&m_a#R1gnlLB56?kL8ezi5 z9im&_=FG=*#o6T^{GxMHGjaZ-vuq!giQnkOUsa@1d~nx`38EVQ1aYnG>^e{$!^sT{ zQNJT{xj94HGL(=14R$`9<7l*{7XJioUoM0^)XkLFZ7T8w zI{cDw+OvnGXR)oKd&9n=igjzw=|J_zgp}5E8mz$_U!R|*sm9@{CqL14DAEe~(>+=1 zW%i4dEMj*p^YYrWx6h4ar+Z%`-I-4npWmgGa(Uq6h#WI>U8pdw;d<4$sPpPu0qP1S zBP~O|npWZWlKOHBEq9gcxO971Zv5*`{$(92(KoV)hiutY!c-zlIb73I$cIybBz=XN zOY*x7t`LXBoPbAedBNBgoYzLy*1u z-8;eM<**BnNtGG`fs2<>-f;(&JCb(}a#C8e`8}~VXo6k7W^kcV6R8}1sXl-2W9i(G zUtGn%WS755#IE1QOGZLU`lKypzZ!gVHoZwXSaFSBYTy%7AoZ=0WhSCP?F`5rejbC) zlt;T5#TO4tq`5T{qq!o$tF~LL{ax2SW{q!Dbk!bE@k{3O^KT@Eeo@+LdlI48F(u3Q zh6g$N-n=y2HGm4$a%ZpqUTPSW4O?juoCqTskPxOyd84f@9B~dO|L9ee!~&_Xdlh)+ zi<#vU34{KoM(Vc^Q&ORiBJ_KT{iJ4YtZz`$@3E$YSym}?elJf%q{%UBb0sg-ZCF6|?5~l*&1P6L5-Dqw(@k=J=Oo>55SypTXJPFq)b}@z^ z*+vMwhKvWe<>Ri{{}BY2v;jBTwA#8Y2b&cDfPWinBpAe0{ zi8R}lgXJZ~uo)E@ll=D^#U1G6_fV&-mu9lKj%oIMi?pChr90`AGeguf9*tv2JPq>T zA<6W)C>LLQh?Kr#1)*3v)1)XN{xqF3(-1qB%H6AnL?Ogw`O~2c?7yB~eoWe>Xkm2@ zD#A_tSOPOMQN>9CQRHW3Eq8K4Ds?%XWJ4lfJPt+or{JUl{w(NIp#jci4 z{CAytimH&;kz}jiX6|1IjEbj=l4LcSl6Isqr4pesZq$2y1#7_^M5TAp*5r!N%TvWc zUe_g@LnlAp75BVEcV}tjI*6}tv)BWssTqc6t5F_iu8LNw!y@a?LEVrT$$Wc)ocAh` z{k>nFVT09&nxBUncqE(~UIqE9Cq-AYMefj8 zi-c>)5OGxNMqk=e;wVeIa*oJKcb8Q?T0)h+F5oP6-^5MfiP-Q{)=fT%qb(j>&k@1o zb8Q^6af;F;_MS}7Y|naD+(0$exVK79zh|FY;%Ul(&%j&u*nEui&8#$!++=#fCH{+k zhFVikFF!#H6vOj%qy@WEqy9)s#LqO@8>JBIA2*^8_s!!JBOngQ_4aa#z(v^`Y z?%9tLaTldpiYW?nSHZ2X92qG;2Bjv?bnLs0n7|uD{JPwcuvEBrMUP#beg3*i3wi?b zC#ylx4eg|3PLOQg#3ZYSUtzTLy5eo8&w_#*R}y@x`>B(Jw;f--x^9Xm!CVhy)!4ne z$^~l?b_rUvw`#Xrl4U%+JLxRjmD|n{+-Vdow-rfi?K&=lDL0jX$z`cFt9OP)Fu;uW zJ-XkvgFPFQT2uAtx!<2&SLC0qmT4&)T2GGIbLl?Nkq^B*D`7tAF+L!d;=ee&Zk4La zHSoD*+79(oCV#M7b?QB_R()HNS|As3C5r~}$!Y@~vD(K|AN+>}SG}!QeHc!w(=fv_ zE=~%S^>8u@X)RdWbeA!b=r&@ML!XHy=ssLKf72wEvI<#b;eO-^hw`V!ntuMyQP_-0Gf_MxwFkFyH(@)pb_>Qrz) z{lEtDFopIKtgRJhb=6NSWc@*m9*o(Sr+FKWFI6dca5xGdfE=*>2i=8;vW>A3AlCbS z*T?hgc!PC0O3QWql!1cJ?@nlz%O;CzJUi;uQXu9cv&s38r;c{V) zfU)Y1P|#bEzq;@Cpl7Mg`ju_IV<0@j-j;q}zjwDQerV~+bTUrlHkNgg$CEy2Xa604 zVN<*j-szefMtk2QI)i<6{hsN?Kdi+uXZUYUJ$jb9jwJZVcvvX~D^zC`cm`FjUlLRp zme-4bXPx4l4nmAV12t7MifeH*vMP(WaG^9K^C8jl(^Pc4jiN|qwr4{pgRu`iTVzS2 zYxd84{CE&8Jd9GBSRU45bGdSnxZRYFpIizP;1iZL%+V^r@>(rk&~dhZqEC20_@Zm+ zDiZXjw#`EX@^YAG)de<00n zE_6o9MaaK}OU@+8n+A%x8_G?&+yzSyDT}htuju`Fms;1p8|IT@n_|J<8Grp(d;YiX z1e_+yzKdn0k%&G)8tleO+p^e{Wb{xkWZM~i)ABP3wZcn3>F<*pT?@$utl;I6nlLs8 zvj@wW?bc8+6*gVo%iNZkY7x%-OSjb&9%rT7xV=M#jNo7prr9)cD* zv=^lYgzu)z8?_EG$hby~ZuB9AT0}3FtXn$*ap?OTGVXzEos={iCZ!Z#uNAF76oegI z7c61wN;_Z*yMO6F*+3&P4uPA{O%t6As8O(Cdh~S~LM_y?tYQ*OW*M+Joa6(PNq2hi 
zD$7(~j%Bk^rm6*@ObpYHLE0+c5YWrP)P*JAVZ3GE74cC5X54v0^IXmc6Zp8jR4!LHqHcJ(j9qwn!&p!b&hdStOX~IR^GqDy?phlL%|}wyQb8l^%H66+c&dqK2qqW zp{|erWky|yvJJ7Uv=BNfB8+IvnLT+0vb9KcM3QiLnlP#gqT4+o z0#>>`V9mw!03mO;CY#&^AdgEf6a`OEY%Fv^!@?p#quHj z6Q#3(C&K#NVp$>4RMgZk9h!5Gv`KofmymlH1L3ef7V}Gp($=V!Z7%MM__7sjwr_FQ z!bRf^$gHf7;Dhx#r2SxPM1rXz-6#HBBgq@HGU-iu+6r<4Xi`w+eAIU0=PUe5W{>Jj_7y{_nvHVm zM=J~^kg*XJc7>aGtr%wRlBnkKQ=3xlmm>bxFN~2My9I_%@@_AcD@E9XLK#jgDfe6_ zg0J_nYrA7V)u*CpZr*URuS#rCt4bAFnWoPF-%eC5IGpg;DZuQnPFoA7dp11V!HQJT zpp-P#>%D_NXz{1}Bkm0w`W*WbtvwyVN zJEkF$(?!unp`q*aO=9h77qd|9GWdVkW{!92#15LMl8ASW4;UC0Ip4Vv%c5ymgr&D$ zQZi2vw&m!}j_`w&;XAe@&8L4KDm$>jD=bxi9VMYgraE<(%H!iXvun}|x?s_oz3z(h z+CJ0S6yZCAmK0HcIJhZwGCMPC5)9}6V9Vp#7gC5?1`}TWNr6UV8>|1dN=Y+0h_jB%ymT!StYkvZnM?J(hD)>R?)c z(CDp5`x4|v|?n#pp_gUTX>s@;ze%Q12ykbo30AF-8vX;f|JM!LO;P>@{ zc3)8IV0}L8B&!mEj7}33`ONTzc#kidHPx>AuA=km@p-fQUfU%Z-xhL?29NOEbGS}^ zmhE>TWxKrxUA^?O)9kXALt|rhM05JGGy2J&JvUh|`AVqo9rozXEATm{pTckC>fQ%DsPsK~;rO{;C^yNS`W>>ean)#bY_6Kj? zR+xWzWRCxuKmoiun{a;BZ>M_haAL2=ox&XdS{rP({XU^WQn|M;eY@1Nn(hxWuaFTv zOZ5(X3*}@DSX7@Hty|u+KUiK~Mu*pZKNr%wm)gtHlP)v5P=}GK!#taPTC-Q}zq3jy zw<%o2Qn!~%&eD;{v z;_8gc@$(j=Fe66B59?4`f#2el0Po<({;p%}MmH}5-8&K$d0Zz;K_0XrnOHUxTTAB; zu-b%vT>5j&AB?UvCZ>hDpUX%CXmE>jt!C~p7QME%*0xoLGtThFPB8wVNZFB(A#h@VZ*oxd8<)l5CvtX#zx z`b*Z&PET3XkAqoQN;hX#jHBg#TNJPoI1wY#2g^=*{E|YfHVg(EqR$kUK$=%{CN;8? zhmerC7CyVZN(qRHD;ZF5n!Buh+;87LPf88W^{-etQ3r&`9Z(mnL^>XHhmYg~s&_j? z9>JX{KFoATMzXUTd$=xlWpy>O)x&}R39mrSCClN)5S9Aeq`~N!~ zwJRF(`AfM2UK5Lni-!P(Av`7ASH_&Smy+&zdwTCv(Q#;a znEBY>0M7>Rp{J&%^i1ty0bu4dehfYKRi=O@0iDzmh}kYDpk#x)^osBSTI$HjWA4bK z^<8!KqQKpRQ}F5^9Vc)$`14G=M*`Jv>YAEZw$V+;*N|COZR>4K%1cfj+oZx%fN^wv zyb^t!Zy_54{52y`Il-)Hbj4$FZJ=V@ZzJl4`VW*BKtp9gLQ(B3LHLfG81`HB_^!kv zmMV7?fCk=2-ysvCKL+_zBcL^A=-~5;Of`8TZjJ_j2Jf{XuyoDQsZ}sxzrfL^UNh}o z-o2j&aq2A@tJCgy(4@kyY4D`lmE#|7hXJ-VUiSPP(4h!X&8ayBN-v$!C!6UzyXK7t z@E@MMp^Up>m34`9)*a+0UVLo9jV=JD_0}FoY6vmMXVNwET9(}g5{mkx*-2aD)KCA$ zAcb@%St_QFn>oXKaCllKz#CEY?2wou%Oa$H>c* zuAM%2;Ly0r^H)HaCl46wvl)R1VA7XPn%Nzwtm4_-r*~Hs{t0sf;vt{Sf~5&jjjNac zYB87sxLz-RqA0+rh!EKs&W`5#a7P`n{_wL#*NF~}|Ff)R*|Ur&L`;Jh;7Rfa=ay|a z-@PYZL1gTg)156`B2G*E!DUVx%z6O<1tMHhzR~K7Ozh@1Aqe>a4f33P40CtzxvB`J0AvXp8^j$O&=|^ zY_5w>`fR?{?DPELDV@%ft^L}~v(%rxAm)0`yH9w|9r)eiuR}A|-P=Wh3FwnA`x5*K zhm&&BxLob!yqX^&hs90O|5HaL@ZS8T!N?35Z?|3UUhfH?LbD&kzndW5m2~lQ|BLUH zX2j)1gAGx90Gn8e%PT5Enwo;f3j6nDRy}*y1i_nvO>=%La};vhFgdp`W3mTLay}ja zF{ONlLVSnbi4!b$Hk?U%cYJzxN-HyV!!iKgFPEdg0D!our>AEvW_7;~tt~xG-4vT~ zfE^%UVEgWVbMEDfA<|g3tG!Ju_|M1c>+vnmOw?!poSD-XkJ~>c>g4-7omSea%&^RW zz$MZEh9KFzXyg<4;3Js4$PbqnQokooXjywfBJ=&#{QB;NYBxg9U3x>_;z)hsde zU>JP5$I$Pa;rYq^m;~|++;E1B4`I%qa2<2!ry}9F-*wbioA^ui07_Ju4Op8M>&DrH z$HZDP0ggZgmA^oL-Qj5c;j1#APli69)=INw>^(Y-CxphfXT>YF<#55=rJElKgtf-i z-6(W||6T$~u=c3+W}lqQ!I-VNRL%G60+%Lmh3@B%I%CsCqzUUtM*(fZi0&DnG#GPJ zFJY;d;%Mf)BR!#_e#4OeQt#Q{Fsnr^StYkqt(?Chpf{=Z$^0CFfW!@TOsyYRj*zEWQcmExCN=N_w(U9fgs1y>w2HEq<$;k3Q8QTjvngWwEGAHAkgtej>>(?1aJ84#dN^3w(j*SG$Ue6vx;T6=E-xkquUb}%ADt@4{L1NXcID5|aN5I&R?y6#cP+r@nFVXaGer>!U6bzc^efM=`r z2i^|W%!)1x=DF8kn3rWFee?e!$`6wT&_|4AuT6zz!=;-hVZHv+=48rVQ4b?#;OJm* zZkwcImtI~f)((w~rpia z#ncAts?P4YxObic?>2C`O#nt{V%%UG5Xip$W@Sp% z`v5wE$%(qtA_&2T+`z`5_^{j%xW3Fyx@D7%|H((K@492C&5yfQU4Z20iw3wO1+Ee- z;@yFlX~6jgv%hCf9OQ}W*yqk1=$OX*FtxwXR-6lPoRND~A1?nUr+-Th(DYd<DI z+Al4Be2ygoQHA*I+``*m2>9c(@+6>p4h@vwI05hZqs3240t4w_#}YI0@?!osTQha$ z*z}*2Gf3e0-+-hcQcWI(fcX^`s{c=~tO1PCfKU4Wy3kSasbmv?LQZGNq=9E1WuBC{ zb^MrkK6}6>xbj_8r&~ARo1Reto~(@oEyD3`h1d^6IVnOO!v3Tw$n7oo`b@OfzuCFU zr^I4nA3nA$x>XCxe!gbrgRKRB5%mJ%^ka0PgEcXphytR?E9}>f?pq^<=in)laxZZlmo-A1Snwqi6S|MH-fD)CTjx`?awT`@r3 
z$~Qo(LE4HSfPq$a_qHA^_$8roK(|iDDa)3Jd3nnSdhX(G5T-+3J55_^_dx za(|WdG1nY9JL+;9=GUY654!?Hxjwh2thhu5R6YVGCV5N+m0gOf02Qdd`wO}l_Yg5T z{zz(7IuPS0bqhy1n2dnt@&JI?6#xK<(M=*kV`*|mc*(z0Kju^+Ux1Hjmup&Z&>y)7 z;J0JBG)V`_awaX=x}Ay1<3rIciaz> ziozaZc8)MWs>>BP)iPFti>f%kPqJS?=-S?1izbn?s$4Fp0EaE^*X_%!vP^i-?MIa& zl)BQum553wbmMhsfMaO*2HH){ zmb=f7(2BvFcRYwhD>-^LuT5zmWIPq{8!!jUV)_U^Gz#4D|3R{Q%`=gr2x_fWP#6*qVt!zkml9{MMdKil@qWI{@JpRBZ6| z<)q$%`i&LIWJw{LCQ(GOkI{dAYQ zSllE~w~{c+mT^Fav_)#I@6xu?`Sj?|+M91iz$q#ww!NDukNWwZo|B?`3YW%d%-Oe^ z*6i1=GWe~QPs(jb3{uu^%rq@KdUvj69xa7D&GuU{x3@RnbM0ku+i^o?RrX7RJo>6q z&P?oY0t*1wO&#H5)|sW;DT?rPa$?dJK9p3*M~AbkHO{+7={u%ZeQXi7yA^U9$yK5F z*;PDu6L5pvoTa?N=asm$%&)d#%7%285%*1}1PO36fCjWXq~hDPhBvHtkPu%9RPb2S zo2ENz*y-_R4@zGfYVBV2Jh@))Gt)V8m@T*7_tbHB+tFrFZhzI!&tt@VuMz!3!1Igr zc4fHx0n(kE((`jjcM3&F6B<#s#M@MK*e`H?l7ItFwV@}Es?jRuxDNDa`eyegJwHE( z6kPAeTL7X+_5jTJ)k<`lbWP3PBw;NtMBvEV9C{FPbO3H0tCy%*mKz^&n;(PI%7R@c z0RrIFG9=!?LN{jDpL9>|*b6ue5`MRFg;9^C{$V13Izvx1V&MMsb0M{dwf7JR#BPZ{ zPN3)088WlA+DTFAX>H%FgWVzRqpasUC9)WSNThrkGmC`ZCau)tA=GAQh|ED~7Sw?V z>4jA}e>79!zo|+iATIPx@u2P3~6LkM~q# z#1YF&D-^0sW*Z0TJGDD)cRjz9{OeDSO>(6-;C9@R8MVh=062S+GU+n9CVI3p$;h%f z$0E1(y)V3e-fA@rJ<}2j9IWp+f*#=3QE%mT;Ig%~Tf4)(yKi4(4)+9S6(|orqu~It zO=vf#6Mzkh%KBY7@cu0<-D!WS%h787OR_um8<*#09mdkEeuqpeEC47P$G{zroiI1o zntS~ovhN{+7G|Y?>iD^v?($+ zla3bGHvA7Wmk&jK!F5*`IXJ8ERX)?L<6rAeFADhLV?zpbu}my1EGY{X#eOw%{)2nY z{MrmGht$YReshy`_dLe64;t5=OHB;0bW4hb#s5CjFj70|4HQzk1`{QvMLQNKCiXOP z-7;u%H(UmOwtfm6<$tXdajVqJA6QtP#?*7;1>Y;jXxcRm3sgZ2Mb~H4AAaaz3-Vt! z)SOh0|8Re9u6B2?>DVFP?;hwSUE8>Cwvp<-NP6RMeBm@g=F{9w3BHZq;g+adk5rkA zbRQX(^d}p8NSrLiP)jB?2CZyOSpqLn?t5bd(^E9=p~H^?9+8o|W3QZql@W8iODFc* zldE%(q4T3G#7DW;n1&%JH$;J7{>BM__wk}$TWlR%+;l4g{wx|8}GBliPw&2xjE#L!B$VgEHiT3wVonLkqpJ{N@;b%s1psHnOim`w|e76^^|o*1ZRJopyqDw_bjzUump~YamGjMhc4R z5B@RFzoZhtU*`itBmGJM==Yak#L&cqO%;GhMt4iK5m}Lr%hL3t zEp%U_4omngD}i$YRa(9+cIPhn>u9%EjYr7E2+Z|+KW~@;oE5)Ho7*2W;8Q=r`z8E@ zl|odezU&9#kiDl4R=V7n;Jtg|wwwH$8SbZp39M6x81DCiVUOlCC`5{%y zD=T8wvAEa}F+6;^H7zWY(m55{$OQ^b?02?D{l85T#71>5dYorWA3`6^dh3nX!VS1^ z0y*`0{ftmjj^>Hvm%V|X5eNh9-xwEVOX4?cPvLUiqDe6R?T-N z!)>v@Fj@F2xN~egl9l(1zLdj4uc^y@S(83GR1*`GLaDvusv9JPS!vY1hcvy5EhdyH zw-b+U>tS)BPFN3x-Hc!;R%Tuf<4hNtv8f}cQdHX4%uKon_VcH2d%63#zk9z?hLc9`tyEo`}cXT z7c_FNRTLC>t&$h#hXm@hQ_x#NRBc%LQbX^X-BKT8b-`Dbx;!Anbrl4k-?^6gny8ZN z4IPw}VKy=7;p}yj0c~bqkIA0ll0NT%(0d8mV2w#1v8|F{1I`~-^Eh@2xWE<7%%Vvx zD6c@OQ`uceJz|q^bGbRmQeCH?Kgny3-1ni%3sz1Dy-VSk=tWP45Mh_6j)5bid><9e zqe`!1shZetaN5vkj9(L-d&X@((Hhj83%!wi4SA`eg(b6ke;&W<ieo0CW;lWdtOPO<+Xc6_cKkDv$o1zEeCgi);L0p@Mi8v5rKj z1;apy=U0+>lx&$T+^f{)?%m~g3VKu;Y`vx7uUT-W^)VSH%o zmtXZdH7y*I|E9YFu7``y14}_d-3qUQB**~otj5Fe$IiILX);zesuL*gzJ0j5g%!6d zR#hzt?`5~}-T3_*v;GtlQqTyBH{#>qv(=RcDioIYLC|@9q(*#nXmjY}lcqB%#9B3{ z5m^Go{qfEBG>JQPQA&)EP@^8~6}8~Xx2+{@)a{Lx%>&*ZHWZ>1G0#K?ylwEs?^AfH zJ-phRe7kb9BEo8c!m8Ygz0NYF+{jcL5+D%`qFg@ z^_k1gA7zljy5FU`#ic@o=#+atNx6L2-z56>IPmD635RyeTxyVYcDk!z zcK#D#`%tE{pr=7G<(9&=r?{EI*bN2w>h3A^OdD6nxVgFAz2f^Y6FcT`z!ty>;C+n- zBOnFz503Z?X<2+D12zwzY`4kJIsnE|DlGckLpi2Tk$>4zu@k=yc1jGV4z|`Ov!9R%{eZu8wAZZvC`gAo=K2fPXwpMpa6T)4sy-~;z ze7f3IJ&RQ?>KTO#hardhx#UmIreCcjX%g6rIR&F;wI?G?J#FCKmj0Hx8uzp~Z>dLL zDHlfyyQr$wVW(1=?wEfX*iA{{MB)9qiZXy_`QPpweHPz)-}%&IAzETD;!WkejC_)Q zXz=2Tc_6fS?3wLz`tsJvW| z5vJ3iyvTz3h_jqO-TX>|t=w^SAAp<8zDG7HXK=MqW;C&`UqKYEz>X?L_0ZC61yMS+ zt|IgH7AWGwg8rO!A?_jF4ij(tEuXySwpeX9tj=ibmvq642+krn*%rWRne#|B zwlJ@kv+HI;;BKz=;5D-cc0EiC+`jldj@gG#d0Q=gU9Jwy`CZDi0fEtRRbP_JKS8tV zdXsE>L646!lq7#}O^Y``y0D2|Hq-B+%ZWRE_2Fm2bapt^yqrJFaS;c4>8^9=d#%3< z<@pMk!**kC#5;nN!vie^-SImZ4z9UatY&)ta;0pollscr6t}@F@O1V>D~p)uC=P<(8vc5Ut 
zBCv?d`302*=D{X{>6Q;(rVjuj1TT>Dqx&@t2V@T(5bXd5%Ub5OL00`ImXZF-CrF4+ zgk2*P#fK6(KrWV7EM9M4l0;MI*C>U^Qu8(&@)uracpo@q&E(=xQKT@J_Zf8=8xc44 zK>*vSaL945&D{2~n%-bPr|jb5YfWMej-FQ{=*2GK$xDtU-J*~;^xsD4LmHuaT#Hue zhm!lwAjT+PaH5@KhV|I~7-^ah|5lL-c9+RW0NL>kFPtqV=$|hNnC-j%2mEILB%S|O z0Euqm72j2FWFH)^)=o(nQh}_=GWCoNI#AW!33DuY8zYG&(hKy?I%TeJw)!v0 zVWt6pp?VG#STOK0YlIKHCzTnfB3p6Jch?5Nadxm>(5&3n<#Ph)rJf>*r!Eqr>u>O$ zHCo-^zKP{tdEp4PLA5i)h@Q`d!-O8`Hk(wLA1gKsB{1#Bhb|08`@wtP_kaw$id%TA zK1ncyL%;SIZOwmTuxA<;qoHptf8(_C5h2;&p|d1w(VVnae5JGL6$oic;_vP5}tXosSCq z__#;tl=te`&vIXboIQMUX4`8L;oZf`T4ZS;XhfueBoksVQ)9mMl6sp{da#y&vY7ZH zW8IVjg&i|!o z_5+X0`xZ59zeA#$;{8G00%s8PQ*?2=={k;q^(}oxONRsJw5 zj>JQw-|@H{i2q__k3Xm4K=M?qq7VtAGu5{isM*~*`lC`m6q)jIIxVbcj=a`aJc$Oi z2kEdk`qeh-#PN^FOx*UOgVH#=!Z!^rgFT0+^``?r z)b&daSLyp!!K3=vwaPL>_Vm@;uzB6Nd6_Mik)9lRSyULUSGg{0f!{S@l7m_=*8;W4 zG8dGdU*?Qcmlj;LuE7jc)<#Z^I4&)gFSdEi1M&-*LW(&SeOOsg2XXA%N z&`;-44t}$h>}$D;ueuc>$4Ldi$LO^y|9Y~&h}=*8om1r|3$K%bbX-HuIFwWOp?2<& znt`&8vKv)`4<|Pl%2`CaRw|o?6Y_@m1|E5*t-txk#!RZ1B`>GYLWJh!FS@q({iI*U zd%CP3J3b#>5sWRC*d#WvQY$E#&QLUtvEzM1g zpATdop=z#mf5hDCGa4ruao_T?Ki(DE&U8or<_jCFxf{UyZJ{xAWu2MCr~D68bXeJp zth0z#KPx<@GXKgZr5Jton_~<$wIEP?V}ivz&vdno4dzzlbaOHWq#7Cn%FU}$W;5{U zf15?>_mqtx!?~jTjY@v#8xEC1cIeQaKIn;ihg}pDVroM^ZJluz2hHjB(Rk|IJN8Bh zch&D(_90WYv1>g0Ai1GR%N;`Cv}Qr#I{j^#XSXIdL}i}GZ>;A^*PR+ngr>gwvN zLS9i|@5EpS1;z4N2RDZW=c*KY zaO8Dd`Qp)Z7usgJ*cl$t+EJ`yIOmoBj9G}YD&~|zpWfCK&1x9omNCGC1peysC`|qD;tU*}h&y3<`rb$JTk&lQuhzcnV^82vtVdvzKLO7FVNacr6)&JviRf@NVGYj zjpwD_h9&;mWka_e&DF#(6E;8pjIqSrk;BFwCb#cQEDqgC%Acw9Rmd(3h?hlP&3yfm z9}?#JZY8uZ69lg>i{wj0DwzixPTx_Rb>;sErN+n8 zbZOUe^vKQ<){vU=8_EWs>#C)E4|jq5-%{SEVcnpuMEp%(50l*T71pQpws3fM`_m~! zA02f~D?`d^eJWw^f=K$7q9RU#mq86|?lOgh^ z&7786RKZo{o^2D4cL!UU+U+6M73b|$&$wZn($>Escclj|#Pz@KAL2BG_{YT7n9wva zpaZvFIx&CL#W+MD(pfS)=-U!OcVnC9F*k(RW2Y6VzZUCZEZC76VufS$~-;> zISqd?-v4 zz_v-GdZY9)-@G5@2;8AXH&up|m|l)GSwL)UX8A84+`EZ0y0D{qQGHfLOC77dKrLee z_iJ_nbBztsr(jauM(I)4`?;fp4_hWMNoHk)YuW_aI{3LJ)$HWaD{8MdvWb04JN!@m zHeCSA@C8(_NXjLgu9fCF*$@>Cq8?+Xa$(}WZ4vE98rorK^|sDMH;7n4nboW z9ubiu48A8z@{#m zDV9E7S7}!4=||jzUXokpQ8NY-&uiaiYIa_)4z^mcK)Lznuxj2b*4pHAUEJ^mDkT51L+_hU>4ddMDhug>$I^4NZ8x@}M-jg#RoUD?4 z&XQ~3&#o^Jdkn8E<-V;_4Wn=K+&T|Sa!bmRoy#-*{+_@f0D5u|*1c!bTJzO!2kT)N zv#)Mh^eD2W2n^s|yzgw`@rjRrgV#z0(qqKWr$Bp!C0wJcFuEDH>~1KcJmNGv^}5cp zSfI+&*|B4sna^`&9x-5mVLGB8oM!quSDNqaOqj@^0N_L4Iuxq$o4b%FqN6@O&TP~w zF2#0ffiX6k6#BMlYa*e4vvJCV29f$$)fA|P*|D8*=RUYel%zTsxFa+N;TGUtxlb{^ zUpFS45%;OQt`@1`J)vZsBkBb07;vdS9y) z<4S3Oyur5i=e=LIq2-BeHKyLA^h6Q`JE9b)w_M&>X_2LaINDB7OjDf+cVS%)P}3B+ zh<7X8-!?%^jEr0YPKJ&qnbbr>eWBS|lsndPg$0h>{FXVr*EKh;rc;}erwBe=#q4iq z;$rRYGV$#|;a$%?w%IY9N0y@^NHH5*EoSlg3_%x%0D_S%K%6qdu4&D02GA*)ELF-~ ze?RDG%EF(PiPHvhYUpJXf8wDpF=gwg#%yBZuTwb^Ie0JS%RW?O)Hm9~fczl%Wk+Ia z2|LNl*smH~mJ46_g4NaHu`8PoD6nmn`62kWRP0=8O+<=Hbx!6M2BGbpk+IIMewh+v z8~gc*klIT>3WgMmrzTe2A96CSmQrNs z!OJ+mjP9HG+x{*(LWA>yV#FHF2JJ7O*Q(A^CZ=6uFehalM@)SR*Vd z<|=k{Qh-;Pf26#E8rrw-^nNpYgW_g7hc=cXhV#`EPqaI|)T_V`-8VP1ASAf~UBFQg z>&@{YiOXU(<0(x^!tmKvU~77Cg`k+gW#SOb4I$Or+d-jyqn$YVYK?KbDXvZ*~1JU~-irtU-ac zpIy;T_FmB0;w3^3;-defIWi%3L1kt2Yi4dcac#W3O|IaODV~=fJvgt>Gb+#wD}ooI z!54a6yB$I75xx?1oT76hC-nn~b5W*Z@%u}|!kIbHJ`OcCs*RF#K^JoPd3cXqg(vIF zXX6|(p_zrVw+82pA=y@1!Rozl2TZDy-xFetRlxDDBK;zIeQauRs`RiDv5~NF=}zg^ zusZ{(KsKZM=wzkOZzf=B3p9oFCQPYFGBNeT6!XvQ=|Tb!R?W`H6E~6EY_}Ien|aq! 
zX{?-Pmq}nw$l$h>t!XVUN0PrQ zNW*lD;La8yRF?CaDTZY?^-ka5)J{Y!kFl|Q5qV(U)dqD=8H%poxL~`XLr1{!%s2M% zKy33n#=Xi9hVL{>dcE;miuq%6buQ#qo@`O z>sRr*m+Lm(pL^;}foEoUp3|Or3---uvfjWm2CiC~&(|1&vv(Nqu~z!<%_rP+R?Jxf zf9H!-_n2qj3FmL>?A}13HWJI;IPPwZHOA#g|nlnYol{ zbw)SB(Dv=PME^}rXHID${`Dk8x7631Xh2X__o{VXOzc)`Fis#DTY@L^&H0uXuxFeMEoBbkSf1?iFZEW5xOo|K3F-7 z7JDjiYRV$!;vKJMg)kEAO)@GMHa$1@ot9J_4^=VgQ`K`DH+-UtX7#1+Ebx6n$SkL` z(IM~LAH%+_imi4TfT_l|DWvPD^WXn}*n97HIJ>TWJVY2Zqa;f7E;=Db@1oa4?=3nb zBx3d`xnO?2d>$B?N!co zuC>=ybK{mer|yHCJh&1!Y}n|QsM#_#ASV|jAl zYV%%RT8iyEX+N=;wc%i%cgdgR5)ADR0!-Mw=>eVbF{=K30q1XdsW%ab*;FI1-nX>d zF2_YT;!3iC>J`(lW_fW0n;Zv;Yjz7U2L&sX(=L+!INoA)yp|#gKap_eNqSK?%kw%J zK|-nnMI8-g@R<{1QB5I|T$Do`8;Y8GSuQvdFjZTM6q2zO(hZd!NR7)(L>qGe+< zDAeyWh4&l7WB7e{Rmd(HH&aBUYwz1VhcwDMn7lB26(>9Mpt`-*Zx&YJJ+-A2T+@RT zeF~J10Gi0_|I$Q2Po(8K(~kvAisfw%QjK6{D$WH3%K41pl+hyd6q6A%x)^AR|wD-q2IA6~n0%ny~`X3$( z3fB5LeIj=s&*}Jf1C_0IkSjY@y?BKEt?(<(d(Zz+_^4h2S?}(VXbPO^y_}*$TjY(t z#_0#Z>RmF%(m;4vmK&Vm(>-ilBJ2hgljM@)i4?Zv$%>OcsD`{EYRGi3+0X$&!}v1^ z>~DCfTf2TZotfh2TYFH};7Z2vXge^F{Lu>~GXEv5b+)e7)+cZ8yo&#%*kaFZeG8UU z_@1oOsvzB>(vOQ%Dr5LvEwh8p=0}P%d-QK!;Aei=mA~r>bNIBzDscBe+mo3Rw%28= z(h?t4b&sT$=ial}1eI)ZA0>;Y+7__m7mo^L8^~`jZM_tICQ-{~hz-4<`=SXnXpDy* z2b}xYPDei+p-(Nd(;O&%aZq+*mqkCK7Q{3prq(0w&z|>w@SWQ+SMM z-@1@6Ay$)PLh|qYCd<3?-|+rHmP5jSmxFGk6=w~o4fL-a#_~QER?n6#&Ac|U(8lE@ zNN=FFWcTZ*Y(NqvIZwsTI_%b6A9$Gw-n|tO^mp5YP^+(*UqXN0H|{81q8vN^I(4k; zKC2`(hv7CtZ&zdm((~GHYDuFnvRl(oTUyHlrJs(lfHqd|JMWKPK21ZNuw1${$EPSY z3Hr8iE4pL!7!)HMvho#MZu0dAC3!Rwl7H0xy8BI!tU!h&EaAft(_@B074!VM9@$zZ zzm`q9x=dAzt>Iv7ux+~UivN;{%hXvr&CYBh5V$U6eT^@ZUrWq*N%)z!Gbye#t8gc+ z8#Kh*vuFJ5u(~-t$0fd%kbf?F;#xQIE_rEWI#K_q)Dih{x~*)gWU@d`y87{kLv=## zl%p`$so2S#y++}i=K}Ffp%(yiHM=kR_}$-uH-N$E!aI41KgzKd4FyuzCjSiSxju_2 zfm&Ch+1JbDwu;>9EY)4)1dLbwoA;?c4QaLf0|#cKP005=s)`(h8vbCfRSQ3FO|?2b z>ij7ExNLu8^Kr!zSFSVEds1w@^yy?~PCR-?4LI@8{0`pBx!G)%eDT{gS|53atyB%X zH3;lnytIp#vs^IKS4%YGt7DJa))2gOs9hkvYSXCW_Rz?=Q|(Z$XnsaO>)!Lu9|G`q z!D2&2yRo*GGHseQzQSdvJ4(P8$>z84UM`k-3gLp%-Zkzf{#sX7{)N_02El##Huv_~ z*oJ{-ml@bNmvy#wFjRXz^Lv3u<+|r>#L>*=ty?*2D2kNG;P{Jj&5G`vBd@{H?_=1H zf)~D}0`@4ld>bpLT%I*lr(r{qbpnWa)nM5m(50z=J$>c1IYTTq`GM zda3V)sM_HSeVNa&`Y-zNx-C_yAEGx;N75zt3|9nxG-Tkxep+9cv3g&=paZ1wT;V{k zwExrZO`>=%`YXjzM)Fsf#i7cq#j=OjcmQO*f=5EZ2_d^qPO0{|3_PZjjba}H@F`M( zVIAs3a;PlJY&5JAq_L?md5;!ZuJ%ZIFsW#>xLwB@>+F75RX?Z8Xb7ny!OA^cgd#tt;6iL>-F{TL3Tz>keVv!z^NX?PP- ziYCG#5wUZ2c1~ENuRF~t;7;xpNTN{WVB3}yVP%aG?Yy6*gf>&yW`=Tpr64CKB&Suj zBEryTKi*iP-yuW345aO+K{>%V3^eFel7QsOv$b^>EO{I1I_}a&wq){n(0P}hqcn@f zEykrK`D$|i9@}QpnSM-`DM~dMkGw-u;_VByDzc$24rs5!(!h1`7JFwBci-J93-j3+ zz)6J2x)}pA8vd4sFWKOEzW!Ruc@vBe0^#*dPB!0)&J_h+_1}X-B8Nw${K#^+3u^Fm z<03&(6BEdEN)`n+Q7Y)@xUMT9{yp9>6tgoCraH29H<+g|Jgw5l zrA&aeOnFo%mFFJ&rl@*~D8HW!(Ys(*i_L*>X`Zu|xkC^4-AQGHQ!3vuv%}b(p?jr8 zQqYHWXPzxnVGtFNFcUKVDr?zZTU)ks+#S0{0Zxm2r43hNny1R+-G@B-N@Sb87Q;#whaWN^E=yy`t0U@5gj+HJTaB z6HcN4)=Q!XMMOqY;4RJVB?cZ7e)jPW3$xkMWLHKXu6Fw;CendKK^tmVDys8GmDfA5a1TV0@Iw$R*iZ( zOgpUxUkGbkD3>z}Ydlc)G|EpQ!n7OoP?oYQt#AoKCOva06vheE_3er_$GTXRhWppG z6LcTGo1XOB5ESkoiw#KBBFsjq45V4O;y=u2^M8jB$WUK3Vp4vf&atUriT;^=$bR=| z^YtFDe~M5SLD}IkZuzFc(bj8!RGMd zjek3Y3zTKsWrwg$cpOpKvqv3GG8_)Nr*4V9ojrhyM&0hvhZZw(y!3yQrpF8&%?(Dd zIGF8x-hCB6CXlu(KGNoxK5V8yW5V`pke)JA?6f$4CiXT(wZiCEW9wq9y8Qb%MN{i@FOoxS#y=wnQGb4f^EUwq5& zE8Sy>pHwcVA9rza@Q9zH=4#WhgWP*n%(9&NU{j&lJG2x$N#8*!qcO4NYOh$;C8pa3 zR5lpY@?Kc8r9?g+7xLWrS~7me^M$394_M%=m1|Hw7-5p0DxGa!tj?~77TD&{bh!R8 zl0o{Mf37)tJlw1J@~V7UX-rTG-DPX~iZ!kF%+qcQIC1Ci5YOpd8(i zu6H#|Nw!%9T-`UP>`j7mCH8Ucq>CqxX=WuFOqT2 z*6eG=x7+9Pa4>p2|IEoMfSuRv-mm8sLxU!24VAg-1MlV5Pj|w+ly5n8HEi6H72FJD!m9}M$J1km_GpGU#5+`qX-s0 
zIbNp4YC%{9Zd(&Rf9b|pQ^_;8)FGfALG$?o$K-A+PjZt~w{o&T>#hDUak1 zNAKSL%7N?ghTWDdzJuESof;)h=Q+7JH)i8D^A?HB4j_Wt=61rDwc3F*a zfHHx1AzJ4xFDmgTG4=QUr^vVA+^cR~Y$PV)iUq~;Xxy**?6q=g%vu*PqV!|kptYgG zA?Twy4J4HFRgb4kC0izSdZ?9)a$pjh8bIHQHFQj7zXtMd*t$qi!IYJvQ)CuKN}j59Y!8%_$N*^zp?xFy8j;+MP4WaHLS_RHRk;U zNY=#%>wQB7MQ=S;)|+#Tw)NFAA(TThb0NV4@9ea&P^$-Dq3zGE+h!4`MX&bf>cl*1rgw+zf{-wC^Zd;A1!K~w|f04%xX=qhZ}s8_I-Etu}8-wEsnlr*-xOHQ_IFD zf`2+K{H{dC8OufB*I_lDi9Ocj^BHR4AH^<^{lHR(db_A;V}hA13XSM zf62O1Mwm8dC|2UhTS3nft5J(aE?=A7YQZD*ywKf*oX`syzYB$)>aXqeAuASJU0L~9 zAaWRFo-}X3gDnZ!*4*(~m>n zRgEO&N{2#WZlokTks9*jWXFIK+EZQT!T|W_6h+u1w zAkOcGh{t~;p8nIi0rbZh9CP$t%?4fzHwH(jmAq#j zDp{p16}*A63UN0JTj3eJ=$XN8zUv;oF~|}vo$xkGY_M4qsG^_6v1GK1{g_U-bK(~WHlRRtt>C^4MU4YDhE(Y5x~NE ze=l6oCi*fI_&qUZYcki(P=Ul@j$m#$3CP7-*T-qu2b+BfAPPZ*so3W`6y&?nJmLBj zC2PYmS;OJnuVkdmycrD5(FPU%jAM%4MnQw91dGD!9aXvW6kH=J^k{NEgzpQZkx9B* zO_B8R&o2$uHl8FK8dNZL*nLVCLaBwSNMP0v?mdvBmJq(_H*KaFt%BDefI>`?`KWCe z=M>Aa16}A;5M#ng&%eii01DLJ4F`H?;K=NhPyDeI{yXnOfuTbBFVlt< zH583xI(1?N!&6Vkl@FDz9OkqE7_KPAy0L2GCQXz0BbND4M-im2=sq*2Lk;zI;6r8w zsTZi{DfcQ(=A*9)q-Mi)Yk~Jqo|6H;tjxwnTFeH+f}59qnRX|psYmf~~)(oaihwDr1HE%8uW52aw^?!_iE zyg-psd;_^7e`SU&AE8nywET!!`it37)hFo@eN0nDTRUovgm`v1`~3EOT}#58m)23# ze&dm18Nrp7a^qrarGqwB$#w&?aFd`xPU8`{G-Jl;PsH7aEelgyVmWzSgci|2f3--M z`(;qYUjG^oOchJz(9dpy?6nVw2g=H0DjRrHMU0kF{=H!5qK(Iji zs>7M@yStzBBD%`^G+&T&bL8CSz2-*HfGW%_!n?uhY$rKV|B{&X#R+N>#xzJkX~@aN zapZLuxWeTKcw&L8!2!bNGEvrn0GZ$eRE(l^xl!0%(Tqe#=fQ zM7uL$`;!7NNIN*{hFcZG-Q8NA?04edBpbt%IcgU~mWKncB@2gF2Vh0@`=e%sM!?&SYGE=&5RB(*}n%%brwaayhM*`+_ zP)g{am>Iy=?%f(tic-Q;L1qW8WH7LWgjiA8uZgQzv4w3fGxp>Y=U zV#~*0C_|?6yYH7;ilV;Qw>vQ`+)ssYmLg4cZ@I|UAq>qs12_w(x8zCQKJsCRD}0Jf zNrf0-gVxSv`FoitBv=5{O{hTow}G3#v4hPFNEyXU1HPITdt;AUMF&~yz(5X$`{qhR zxR_3P;?Pr>qY5jan^3+JP0K+GdftP9Tz1YgZ0HaMJrwRTn-m&vUy^IV6T)zS`h0I? 
zARwbx9k2`?#Sxl6UFf|l$&uA@RKrw33VKky4q-2kIdbv}OMe}U$pOq607v;|%brfU zOm#pJFgm-^K>wj;{yRsK$kG#aEZ-!X@JMC@B|%QnG?$lIW^UOTxIrSbd(Ovh8ybTB zX{EU;0gw;t%erk>vVokQwygW0V)H>rV)qfJ{l^|_uS3i_|K?`*tfEl^ ztMvw7;k4PHgdJr*hKq?J;ou4N^)34239+rEw)kgEF4pBI4{$Sq8fyn1>(xJ(!5Kar zrY@uqIK24)C2TOM{$uz{Jr)m4$umUwA~pgZ_r}DXnMwo2d?!|$ zgo2=K)J*hqS*Q;EY!yZ`t_o&ZN)x=yX@D+edSOQ)JnQBKoM#3 zSt2Nd0T`fdn?`lUS&cP2ii~{joZyqz;H91sn}M<;Av-KRPKgx9BV*nA-JGgmyW$Gz9imp)XV*@tX8pYNK61Hf z$((_878C|#ghS{XOo5QqdT*5-04-dF4hffNH{gOOd)P5|M>t#|C5@ZMM@VKlbttTy zi%i%s#ja{3-KzKa@T$I1qGW>N>@CPN3LQuk9nM0AAmW!BN1&1OUA?Rc%{?jzFk9)?m;^COgR zn|gXaZN3<{h3)=isU4s`tn$D1!QWdiF;rM*mSBHkBE^y;ycM>4$dW;`sX5}eZC(N( zaazFLj9qC2W+nfPic9LPq_80%7P(3;G(zSy77`a>wdt+Rm%?%wUONm%VimQX(lfI1 zC4mFFxD`tnAHN*5zdJemB&p5;#Oiop-q_kYpKNfX>@HN-N)kA+X#kXd8#x+pAB`?0 zD{>@q;-t=xR~=@4dx;fwn((D*T#U|@Nb)wk zJ8RobwV94-C3<>qRRoZACXPeXAF{sB5ymZ#r^iXF*T}lYj+ut#q!4h3S3X54D=WQY zy$z=0aHmKtFK?%7+vcJbM%%eeznn<4=y3jld>bkBxq%ZZr0d_O?CqT&d6J^AxDFBI zgW=r{Xl#p-uNdm!R99K1;Gylw)EJKyPU7XrYYd8!OV1X1RW@M!WHzvhDTG&``u26q zfMeMd4PuRkZ69bI1Maup24q{+=fw3#1gTP9EkB;0w>c&1D=t=7kNkS8mX*0urjZqj zdmtS5;svSy7;|vr^7FcLx=aQxju4tUMzQDL3yh<{%aOS<)9qqIDLgGz_#cvk$Jtm* zib0+B8Yn$Ebt*;|J2Ej{hwF(j@}{1|ahImmArsdh2;m2?S;><^YX?KivRZ8Go%QRR zb4{qcQ&4(>olfc_yDpbGy#~(f7rbhXff>Ce03Z>V7<%x>nt3&^;3TWX2D0?}se?;l zdaDfu$Qg<=J#=&Wu0nm3-OyKUS#K!FMf`5I(q=ebobqj!z9%CLE{3Pa6{@qDYDJB*fQ6L$8;Y&# zVpZA^n1SSj=}ZPyj^Zz7OPaDX5;x4(Vy7N%Yj&yNa=cd{8#i&K@@98@y z94U=6+Yg(cViZ3=x7UeHnV7gZYfq*}#dB9Y+`Q)=SLUK4COF&P@D}TOO45-UFWe}w zDRLxeZ&}kebv@=bN@1;ju?%ad8^m~R?1w~*nk<1`N2$>A9c$n!?ph)*&^Xs&=ivW$ zdjq;~>gn*FxOfu2Ou4o}?0Q!x3KtsAz`y(Qvpy4^vi}QkPfwyrwiUaUCEm~zF{{k) zBalun4H&}SgyZpH=*upW?C#)cO2rtp1dUZxa!JbFg!}g1n+=s|2sVYr47#A64W(~- ztVR^6-|rQoK5Ox)4rfbNz7cyN_uO)Y%O|!d&>}(DZP`{K_%M1*;-kdAi-*`0#lvJ< zTCO5sg2q2WAse7iU!BZB2OVX*rG_R_xarNz@KCAL;jyMNuI4L&5MNa*W`QS5agR}2 zkTM2HY+UT~dsJ&F=c{3Lf`S1C0mF$eZ!(!Mfmgp28VH0E}q#98_K|X7EYRvg@+qOn*}L2~Y{S;O9OoX}^j zPiy*$kzR?F)Szh{V7^3=y8ZMMA!jS1uDdG?%K76QJNJGexUM@Z6Rs8KGabY~>mEw4 z_I0iMB#|97Yhs^Ce_LNkcGfCUeI~2A>4_ukxQIVs-lMrGa(E$kj^VE04!bwxx5FiG zwEV;2KBrjya65`LKo7TY*|s0^Yic?7%Hr%SCfC}z{j{4BS5V|m>~oB3VFb+$ z4{~#U`?WBhw={Y}6wy*<;0AE(q~Kyvgt5P-Dw==(dN$nfE3Zv%TBY z(Q%{hYb-nxX-;$;gL@95LG4~$cfCfXCy(>jkJV~oa4cAQoS0pN+a^yzb#gH&LUt2t zF?l#F=bF?XEQ!ly=``jtHy7x(kftQHgI2}3wYw!XHEca7lFq|AkLAq`F6Ey+`sH%( zUEucQw`V2r#x8hMp2*>ieY=jmPkX6DThcp@Ot@9jvWcJR8(`oV*`wfBOI%~#wY@%g zBv4qIbrnl0a1wK4Cs&RBufvXFev zme|=bZ`eQX5cYdS#P+;ZV>orgka4VK$;GmIETl2-ViO&D6af$LD6N(j*g8*_^9pIr zu|N7wvOqJU9M(TvL9znP4{+?z36BI8LH5xn=f4a1y%g{0YoO~QX~S1ou13JEb&r~M z`PjxBmOb5oK|LhC^O=r}M&qGfbt|g~S&5E}Az-{u#>{N)SV!(zm!DqjwDJeNr#~?TU0Zl+mgQDMEfAH5a{O3Sc$BDj4XOQTN@G%QL zy!xz2Tz1TMl3(%6;ufq-%StOPWWIYh6+IHbnCgfMTyA<{ENks#)dNz1-qFnXFljnj zQNueYJQWBH^5C*UTlHt~#PeKaJIkQ4V7~J+rSL*re2CJ9w9bFe{`Z&Cn67^0==Kk; zX^vP8Gk=^V@`%R^%j8ksTS>wQ%_o6?yhZdrhxefhB@xgwdc#NbEm9S$9jRCg3#ru924E_mZjI%hYoJ+u=s1! 
z5~OD@LYHSVce0$#k$&iI0bU{HUS{XmmqUNofiPTW`PIXXG4j`lG0El(N73TZpRsC< zFvnaxFm{Vo3P1quPKEbKtpwgR>^O$N;DyyBB3VakCRQGw7F0KzBcXG1w7I?&?yn}K zq;^v6=kqo;;D@C{lqkdp1xLrT6Q1q54dnFZCWAHE zVGJ%Yh0w4mxcc4gzZp;%Zv4&m;4_U5(zt?fd6q~T)hUP76c%k#z%|^PW{%Kvk~Dbs zt#o<0*(!oowbt8>a%eNH$do%%Uo=y|5=MR}(=I_j|-YC@k#)cy( zD4Mu}BZ<+eQ}kLL5uTeB$PLLMG7$u<8p zIJj9%+S!>y9_rw2(0_A#ExpL{p%lg`OL0Y{nCC^D2W|VeJnT$q+#w5laV0FguGXW(Vd_u?*h9%f_d!wl`mLxvvTr+KxJnV#SlCWheiF z2m!RMHN*63YVagkA;{4|Z#r72y39g#PpIxK@%dcAGi;+w+5}?CowV;Z9_N z=_GT^dJ_p*G5z+4IHGlKwG`hRVd>G9)SKmR zl}Ncknoo4gmA=$BHZs;>yJ-7jB~wadsiT=oq8ZIYY@i-l`jPqcANsj-TIMsiRuReE zVymkUskyvuyy_#ue zoRJwaP!erAQyeX0>YTez>g_!uZZw9xj(vGO-6Sk?3?X&g`CA|Y zoY75YJ+)dy48hztHZP@@vhD555AG|$UWy@lV+x!eKMw4?Pb#!6-j{;*fPGF6S0@#n z_i`?SAz9ut->EFr3pvpu)por?p_XA2RqLzuDKUW2Q0%EGmN?Vjo%0m0G>xQNpeo7Q^tJTlX zH&m-Zh=aqlb{v{DNzJxpgVa#q0@Tlzak}yMgOabDBo$-3XHInbU(r#_U}!03yi1oo zRnDR`>5Z-D4@Sj5{4^Un4xTxg&f^o^sm17?KKkdA?u`r&Ak)IEQV0GtwD*uXjVPU=(Os(3GfcY z;G74`xUFW$ybb+O{*0&O?Kl7AX1IrBE|C%yDU<5Vt3EO8_Qzu%*?jg&b^5D z4x)XDUz;A%d{7)tDSG^Uwj_{*Wq5gO7Hpu^e*+h+NZOZ37l9J_Ot9!$kgA~Wp+p+v z`lT8d;s5j_kQ(3kzCIgcitANpJvO$uOdo?lOERzE*4XMv{;bWCYRzv5RC4 zZ;;$Cmy3K(a@S<jq{YQFyp9e75cj}|oHIfeCIsTRFTfeYRq4-Clnu+0-Z;bJe8 z&iG2nW`7tyIIRy6ds0zI(H4AAV-|di+M3epDobCb8CP5KOtNNL2Ia=l%jMSlk!*^k z?$+)CvMo#7YUdzdZmGpwM@eIw4}c`&Z?*6 zD71=d3^D_ClTSN$%s^bi5oI%b0u1!KEZp z$lHWW&{1aA716D%llG-D8P8=|`keFT#l=Qw{!-$pR>#v$uiC00tB8djk=iL8@oC>_{td8dM-?Lhi+YWfnEqkVI>`^~J z`ENhWT$D7GyOp%t<`uN0;MjVuav(g@YA7HWkpnlG#5hg`%6l%0I`>2(1K+pGT*G+5 zIst@aQPvv@Z0LNX{W2z!Ai$-i1FO=DH|*~!#V*#fjlF`4Q)Tl9om_-s82x%KVEL9S zt##277=Nzq!Z&FD*%nMwO^BSi|9Pmt+;W>^+j--X=hz0k&9P;y$?>~Yco7i*i|>zK zT#jI(obueM(_O1!9vyiQ!R}mF505u^lI~m-MQ$LYJIz%$S?c1td<7K{;_dK{ro~`~=H>ByU$|XyZ4{zf4 zK0Dl?Xnn7t7)EkU1U#I~0+}kXr97_%aK%d{Z5`M*#TPtZ2WST>u(#clpUW z`;!WsYejOddJ6C5C6bGjo=i2r9>q%9MJbwu;Q-RRrEl>S zh9)PM>$&=gHX)9d04&{B1L>=gYI5g<@Z|XWN}04~ubWyg%929AxSb@2m`tAf{h}jt zG%t@tH>sk#P05@V#9MXe_G?lx$8L0Ke&lH$e!<$oCm~1yMR@MWNezFuMxOZWPyLM4 ze_r{gM}|xf5Z@f=48U2{$_2d-GXrnkClzxAy%jIb&87Q+@GgvIqKyUT36ZZMgkqDP zC50zBN>)0G&zIXOlWL13v5ec;t&r$(r-EbjRmKB!A1Z^Cf%Yt3o-$bfE2cyOh7)+N1(T= zpXy*vw)r(Rl55WjB0R+2XXCSpDOV_fZSg3>l|yC%P1LR!UVTpYvtCAtL$yfgpqoGc zuvB3((0zTL&H*7G^FaXffa9cuKT4dovi?OyhUrcXj;`e#KIg(FcJ(txMs;Ct0qXX( zB-%i2(fOnXJ>^WQtw!Oa?BR7%hOEzfjr1q?{;*F6oB8%UkX!C58_d_B;D*q!i~YcH&+QhaIJ5&`cSP8;lLlSVFxUA$aiq0U($|Bf(zG^n@=|X;+k2lf zMMQTiH}131yS{O`N82*#^2TH)sRCb8-*p9mEs}S)n|jneG5;?FRyUt08-0BX^rTTcU%nyd25mh;zpBLxOy8#g8Q!a;j~gpe(T0( zOfaj!dH+c2e5RlTpJ;};|CyQUMQKT(`2*5K>>Ji6g=D34R?XLyofy#ekFK*BZz~t* zTW^%#lZ7@D?PO=o>C0e3NOea`=T*H$EjQf|giw&td?G8&55?@=9E^dLbVJ$hoUtKs zoH#}IH@gtDSFXL*LUq-~OsnJh7pX;|i3{#Ki=D5Zo7a>wT2B#r)NUnS8I7Son3j01O=-Dct9K% zR#Wp8Xj_>GRE(yBA(&(A30*-lG&n~2=P5X(YG_jVhm=FpKtBLkA4y;GSEZyK*`-pW z(`;D%XULtF;LRw`g=KeZe6^MYm+}Ey7sH*JaFTEx&#}fP4_Z)5k&ml+`Qw@>v92u% zcTnXQf+;)=-BG)s8vBwbH8tdT9}w zN(cg70f9ggAeslp0Z%+HUhk3k09@N}d7Hz5kX;|W6()ndRdfdG|0)5k4#A^vFAF|# zo>Z1d#UTwR#R$#3b`5fUs965NlkIzXp=$8?_hOQWWD-a27<**5gB9A5KvFTG<4Mb9Lv>>gCo>i+0T`@wB{dA=MnZBH#gEl@t=A@V}|Lo3j z1&%g<4@|z&mXOWHHB0Giy0OYM&$D$S9@_N6y)0L3e&M!k*A)6|wnyO)I8Yq%ySVcM zAe+|VR1#VMXC=ha#(RkdayGKDw=aqU-AVMeajS~Pg8*;;A_$6v)4Qa6R>J3`mG$Op zI~VdAt5DD)4RT7%-l$M`_s9cA#9H>SA4a0tC=0xVM|kU<(W2r?X`9mw8c3992SU?K z$41_lguZ#aJebCUSZ+M>Z!~+Fp$#4|TA6bk-E>>*cyT)f8gvqKx*KB()4Q$dXo(Lo z9J`}#eB$?yV#Vo;^B-L;hz9xH0gX+^c{|W)fhB}C;2R{0o9XH20($?^_bBQs3EP&+5Vsc%gRi^t>-Vjr(%-HC~VxUBZzJOnT@Fv+*K?SDOv1!T{ zlNs@nX*d4_Gf$V)plR+OZ97po$G7mG!JLRa|eKF!WT%vJaDkECn)6F2D5eNcd7cQ`ZkKNdQ1 zX|w8tJ2qd8`JtCv;BLfYuQwT{8TT9i0pkqe(o(&Dwoo7D;c>;siSp{IxplYBi82D4T*bz#xgUs@m$te6Pu8b8tLA< 
z{X~7)!5?-l@{dTovbH2o#xT|E=~GW#PHlURL~`f+^Qo;DA+=!&lkGoeYcDN;Rdx^4^h5AzLxIXMh8&3OGRqZaOL&e3# zUnyWtD(Sz9c0FUP2*c&#w@LE-_$uQM)lBK#7|Y~ha{@hl;-rHQSrSo6Qlc?b%K7SH zVR;Vlypx6n&KUJs8SR9}*pMZWy9#tmAX zj@-&Ux7FN|e(MC8yE;^wy4P`jM>PF#^m3BDtx%UIK^4y0=E z;!?-U2Ux}}-wa2eY#-* zKm`v2O2n3brONmc?u#b+<_^aO=gEPdulJ*1M`s8^iUKL!bBWoN}5S=U)C7bZk z)U@KWwyal~ELRz9w_+pH;o-wn86W|vH3W1^5=X}3QYfX-4D-seW7od53?4wlT`2?7 z(vx!Rs(oH$>dK=TSRczkA^j$VB1WhTAL6(8XhF!i+Sl_9u*>hXe@8+9D6P0+O*vV9 z3pW$(!T1Q?rkGB!ng)*k#t})snK1BKVUW++rm5vVlRv$WwtL;w01IlQAST4r-#|Y6 z08o)ZIfB+po28;*?=^llw^-#GrX-|Po?58+rsX+v%xdW}ql#X?aFlq$@`!moD4Zr) z>7kPRfULVO@T4XVKs{O~fmeoHdKQdNK4}^TA0ZbX&n{KaVQ@^{53nMmB|YEZ&Xwj| zp+wcg1(kBy@^3yiviV&PBHn26zO1i4xE@K<_>$;Ydfco4L9=(E9^X^X$(P3k+IDWv zv1vY#5AXk7Fr$QaUZTXkRj+OH8?^Y)E;|&YYER0S0vYhQ?=R`@Px+3XZ|!Dly%HY- z&_?HXaplpQ8tqF*BO#2VIx?<9J3L-nS2(_lq(hC@=CzkbHsOPwhKH@+mg!a+&zN{}=z}KiG zGUP+j&Kk&koG)$vf4(7H7UYpMU{hdu1U_@8$T)!;#|L_%q0O3Td^hxrPC;87l z!*KvnUwtpFd}-JJ*$#~ofVN5ec8o6V?LW`M2Rsvr{1LF&%NwWCvmpUe3t>c!;s4L` zn1Q%*WkTuy{Ez&gm1U}!8U~NP@9;V zcKlT2`)onH>)*-n%lEYez#~v^Su7gdi69yThilqHaBZnc{EnWOH`yCGq!-{c0JlaEfJAj+0Jo>gT+0ovTd_@%Ru;e?>Ix z@7#_BeR$Wy9toM&GD;l$0z8}7Emh~O8KISWcj{E!KxC?q!$e9P+tT*@@bgEHLgNsZjh2tE{ zHhy>iz>XdY{(a*>kP>LTqM*$*tJMb&y;i7W`ZzfxqkJFO^dk}U^FN+@jrz;gT=lH} zVfxanGw)pupXH}?DEO)9!}C8Go-DTa(GUphkdQMO0%Go%@j8!3HH$kz$ep*we%#hs#w~;BX}JeEy`T}r|@^gX<#l~dPepoe96wv0j|zZqn_H2RS6BtU()kj<&+qlNPieUEf1(} zxHn_z-xUa;?XKW*yk_Z*o7>)YCG};!!@|4pX#wr;|M)e<7v*xL0#S{uY4~lVl>e=l zo9$1ufM?i8bX?{mfwBURp6ZJ7LqkP)%Zz7;#~btbv(ZskLQUX}c&~v06rRpOW%!ch zyJM^2ojIRmVEx+B=GnO{H?xCGhh0Y z$5kaDCp`rkSo$~qaX`^Jxab`ZfWP#|tRF-N)UEE`4@&5LbQ32obr=AG9#6QoS@p>b ze5|tl)4$Ah&U)vf-mM`H&M0GfmGDE(rA zd-B%r_9$o`TwzVYn;4wE0;uYh)01r$j_P;Rw`}JUuxF4lwDis>OB*FK8Fp=rd^!u zyrTt+9kk75;r(h#tkf~xn%ZSQHc5W_pQ8&@9~_Zv7(eGP-S#o9bM)C7z)o7=^D@AT zA13`T3l_xkt@lju{iNad_c5qaL7sE?JGlWyrovekn2#0`@wOff6P9@aN#Q4&TSSev z5*@W@&mqm8*~W_Dz|nxAjRUj({q1Mo@AKE+B;dwbzNIKN^+&B&cUZ+VH^eyKG1XBE zA1?HhE*hM&{1g+j^eD)+muR&dyVJ`*QxsEpmI68J&+>5JKJDbo2KqNa?6(2kv21Yp zuQiBOFbA};Lj)74#s!mZtSUo()B%M{&uiRS6-6^_G*wuk&T2S6Y|!R*?-Fo+TyEm; z2l>Kv76LTHpP0ltr}~Ttl#I|srWC0=pZ`rW3})p%B3E%UY50?Bi;c4WVMkMVm-on( zrZSO=E4}-2kmq*xXC0F92AZ9OX{6stC;ElyME6NGN^JvFBTE28uxzCM3-ACWBWnCw zMYDEaJ^yBnHs9FFHOt{UD!Z~5(3~D~oaj`6i4`|dR&-_33#(Z}LWzGtz+OhH31GqV?*=%N4eqc@G1W4c`^YSKu6Q&?;~J`*@uXdCD#w0Tcz? z`4DkWU8BEEMW-^=3#x#5*?^6%8(04{P`m&9eocDZh{7<>K_Ocp$ai#f%}WzL6Bh-NGK->Q>rCj47*0TcflJEx1K-z4y|;+RQP?aiyL zSMN2nq)!_LH#Dk)*q{G&AAx3;|CS#M`xC&0+i=9UsgoRfyAr=@kd18~2KNk9{sh>e zyy+bOk(1n+qb+B0*v|RvA3io_l5bnB2Ppdd`uc1>_N(J_BPq`&tKz$^I%lr;%o9er}t+(EhsxFU%F7@t5@`rCd$ zZv~TVwuw!bfsGuy3OuMHeE!z?kjT2;T&o&jTBFO_DEnUzG?sT@MVq^}M)Rj6oPk1) z=#!pb05qJhlC~80xS~7TngkQZlD<@@uZk}s;N8YKGmXApt;narh(<)FXoyJL%_;t@Qh8VEc;`sv80#{MNvolem<(XIzROGWWfY0Mzlior++LP! z+r+7F+vT5XnY3Dn!nIdr&Mr-o&`@uE!7IT6!iS|Wj(B$PtQzrbOB%~6aAia*tc+4| z6*VOKQvafxr@zEr7e}*u^ygo}n_K~`XLF=c9r&p`j z@!HNu@H<>ye*U7d#ba@;Y4UoW`EwI}r#uQ@VIgX#%cOFC5(933@5>HmeagO_Qbhf- z{DsJ&x59ws_I7FzzZ12WZ0zmrmo2UO_ovK<{G|`Y?0$abHTn7Z9?s6CHBYF$OTVr1 zT+-I2mm1@_I^nL%U`O4A&WDQFz44m1`4erW z2md~ISfIug2$ur(w#-7&nq`U-wVZL96yR+9@4&B;b;N1%u?qV^Z>4E?PCpRjQb&e; z{#xm0m;(HxT}S4+z=QRp$0WxpED!yE3XD4-qObff+&;ub$a>cDkq~Y0ZEcP3!-{z! 
zSrJ?FF)zS0H^hwm*Z79WaNi3qE#c=f-LcO|VU?}{6xJ9!?<4wKhD=YjGjLCAtNh0^ zHy;aQyApBHLqG~Y*Y>EQWQf4!>oi0p{t;6VGmzrTR(@8}J+l@W0*}I*6btK`4#RdK zm%=fl+fRL!WNAL-W<4btnWg#e^M4t=pYOysM7`1G&fR$+8k+npUig`Rrc06)4Fp%S zuF#q=&FA18WHsLXnUHeJL#6kw;!n1#hTUT$w>MUPGxdpk@4f{Z(}m>h%$6$s^G&Dn zIh;5wh2P!1?LYhdUBKO?K4riQ)z{G0)+PcG6LV;$I=^68wgc?iuc}60TO8!8)YLbX z2r#LS4In8IbWbiyyhXzK=gNks#j#^aTTssjlRGvm9QcWUMi*m}ECpxg7jN=SHF5&Z zkAjBntVbp6qCZemQ4%{T98+3c+)YGmF5MVeS~B`VeB=Tu@ZKf~i|BZn=MdFj)U9!V z-N#;zccOP}>6{$fGG+B>m@2lFDw;SyEm#x`eC>-ic!WPDQ$F(QuQT>X79X;r#NiJz{k^B7Xb(}c<(N#aXMbO-KdsZ5 zlJiydCmJJ}0d*?+hbnH}t%8B;(F!WH{Ntuy_U!CQN;3X_<^Iw?2ItBr)N}JWQI^s_ z#)tEQEb%R%byV$PKk#FhJ_=6KhjHa zxY7J?yFhO}-^#DE8bikk8FjGzVDIC{ynU_(Tx?5oJe>1_jKsV7wcYV+(^0=4QF`@8 z8;x!H)w)K|lu&NZYTglQTJPi(Z)GSpd*7>(R} zU|uiU*OZA9W3U;0uy`+;)=-^*UN8e&(Oy15nr^R`OO(z3OU2AtLQ`R^MIgA0j^eGfu-xW;r>A;);ji!LgFkL6xSD#FbQVbp7q3G?p6bMfT{BxGE zVu~+1X~6P6nSz#}Z;jm8k}Zqe+_O2gJP7K`=N~f|F`5~S@ov}8OypGH~ zc%j|U4;7*dMnAb*ft-D>Ym9}?5gM?B6S{ITUs6k5H)e0{3iyKm=_}W*vQIoZjr6AU z2i2!m>M0U%AN_A6l6L0re`9IwCRbRAF@tOqWs+rD#@)QT zM`{CPP2ozWd-|{ZzXo--W`Nb(#|c(&S<|p{F8IJtkA90k$o(c-tZkM$%Y6H-CZxwC zeQP79Fx)R$o)UiRT#dvsZQniEOaxt?693$wW*3umKdF{-e|T`gqMWdGlF)sOp4Joo z{UxpwU9omvZRlHdKoa405I*1uLN8GMA{sqe&soGT{%tK1IeO(c`k!%f9qfS4!g$;a zZbQX>5HSr!_)+UP5GL@I8Q9hjyNtJWTU&nAtqS^G>L)A4L+Tyn?ad`$)ap=z;%G_o zrQqC@T^;l^OEX_vQEb_JyS7Hi_hfD%EpcwTYPMgc`dCN{`D?TG>ajSn+R&g#J#p^Y zwjCZ=CGX)^YffMWuZhw4#_;{8A-caW2o&-&8YM8BYa9`Us7W4Mo!>x8c@^ud+S5BR zx5#FW3l_3%ukD8TXj7TyN_9SPvkZ30)tV>C){lSg%x7 zc|PZ;LRA??$1M22qPJ>gkABZt@XS+ok47e{-$y;LXHJ+$w^uo7w--N(Yk9a8zGW`4 z?l(6)jKpt$(r47aS1!x67X&6wwz;M|?_j+}5p$HpyKRt<>pkV&zj6S{=40Bmw2wrJ z-ds1aP-nS{SW6>={Qa@yPBPOqr6|{LHjI7y?&4Po>MAi?Z+X3DnI4kk%bX-DFCu3k zK_sqzC-19L=19XrruCiLXD>WIOzg&^ipaO0p^9R>?!QCQyy?w%3PiH0-fo9rk2uS| zl!!4MM20J0JJckw`mo_q@fUN}CX%CwTfLr>ggT_au;%@9N*|Q(R{yw*3#zI)f_k`) z`%*bN&8n?bRy8m>-pi)`Ba@sMwH9D68~RL2pxI(tPnq!=`lzN)xm4L=|M9Z8uE>uY zakZBMuFKMza`)K3QzMIByc8!#AT#r7*=`(p{GA>teKnVc{GPJb8-z$TqQ8Tb70;T@ zLTFu?-}8=#dU>cmRzQL72bvW_$XLf-Qv^Lsb7T2N(6@rd+@QV*HDky_>bGx372cbV zz#28Ha0MBkSG+lgd}gST8JXqzN`HnAMFy;9%uc z^GU-Zd2@uhwXmd(o0k{M{J6Yb4#HRpHJnO7nNYAK$c&7b?`ENzE> z6iSc=$6uAiexsx?cLNcqe8K+xrUyi2i~N=YNi-4(h~mN*HHf43iPMHB)>{*W@Ciq` z?H&yC3vCK}C2|LqZw^!44QqK=SjRh)6s)R6JS($mR=_+TJJdhxgP+PtS)`6BKOY5JJK6d|A{Mv8y5pya+lmX3ujaJ*_#aZtMX4h1IN8 zhccD^1{-`T^oP~x5tb53rqH>r8_^;P*zJyA3wHPiNX zyqkTytg4Eevy;Y#uV`YCo#B3LjI~nzn5g zhu(h02R1hew$^*YN=xzzi(23W997;d&hdT2+ZP)p-gh25?d>0(Y)x-5u3lQI0fl_M z_v30^#b(NN#bzDh1X-#OP+S{!1WTxU+BGeE0dVCwbvU^mC}-bLpS0(Bpo~&k@s>Ox zxhI!(Ng|*~LgxHZEF-Tm7a1{UUMsrO-G};4&&p4*+K4LJkGgQyZX*tt%y9EaE-&f0 z$@+32g;B&qnR$dxkMr_y8gjXLb->4CO{yi&{xRyqN)2DJkI*>ocgko&Q{L-$Rx{KP z)t`W1aBJx~>#0#rxDenrw&BevZL;7et?&1>jfh@!VBC&FaOaw~Zokq(T7Rl-A>YKA z3s4wT-)&>#nl zk*LFMjPs-j4TQtdx}h{z0_te?XAs&m>b3ieNPrx;&DGM zfOsJ^&Ytbc3HFGLe|lX(cq;Ku>>K3~pkzqj6eF`6|? 
z;*8(LEXHkNllH~xZDo7U$Yh?jBdZ--7WQo{JKC;5ho=EK+(V#2iMC(PjzcS3K#>X{ zO6k_iAqhvu)S94Cgj9X}AJ{eILX#vxO82j}1RoSC#}fLHw-=h1$i|0wE8{U( zP0XSSRBW76s25A&IX0XsplBP@|44DYQ$OsSjV$fEpdI-5)TNNaG!1u!=F$Rth`ib; z<+N6Gi{m|xidkOXA|4bA?*y+~m#Hjk9PH|?g#`V>!!shN?joiZ%{UXZX^?4cXN`Jb zP14Al70ABj(Gf4R`%R0r^q|%Q1p&m^$jXGD!mi1_X8R{Dq!|&4d;{7740-25+lxTn zmuRnwz>yaNMfD{wr8enpJdK1S`!u6!*ldc;(o84TEw8TS{F$k^DqmM&vFhLqVi%NJ zSxfoymKGur^iBFjd54eO`LW;a1u*)DnGY~ohx(;;W&)s~+NMuDu!*7PBx;5Fq{jh< zV+y(Q^Qot3xxULG2zI<@e9z;DX8XPuE-S<5YZ$DR3qtKDMI)o8VcwlhK5YBhSo0_Z zZdAY;4{x^4vC}|1$n%wT-nIcy-eL;j7`YjT-O9@emNA9vkW)Tprfe2>v>Y0$=M?nF zua*xL1;Qd^{b8WQU0d$y%4;2;_dKvmVQP&#m)sQSajH>Q%U^1#bwv99KseE7+p|y2 zDY6(m{QmSw!Op_o+|4eM}Og5 zQ?TH@ApzGZ2VYBpUCT&8%sOMYMZHHk^EgK}m4onYmqg2(0^et(3M_R=?3(MvdE2!^ zRm&}=FK>lwW&9a&Sj&cR=*>8{Ral(1$XsY2NPz{k9UJ_m?4K8@vae=miK9;VDzYO#uOO|et7wd?6G`$FOQxw|hnB*~y z4U(*s;v6${Fu(B1`u;>{8YGyF(-i={GpnDdh|O2!zko!O<18ik6Q7ot_K{(OXI6}p zy9rm#dE0^E4a(!jWhuk&1IiCBLd6!97Nh*W+sDeBk^9P`Ze{0#@`Df<)nE?fc4D`3<=q7J6%gT1yME z8+Y*D0El)$Vj!jNHFm~xpmUzYbH; zbjAXq7%obw-jLst7eGmUE9+t;9^aMNh}m_*P{NBo9rCtTLRA?L55!UT-r=K$iJbiF zK~0C~@=4CEX>)VZZvDBYZ(?(HEUf)mGm^7E#t2#*{m1Jf_KvyH-Y z$m?<1Onp2O8p}P)kgg6p=KZqc+1YkBG20nv{B@^O@bIZ^{|qybr}Y^E+%@%gV@W zkF4cQv(D7-!c1m&s7gor1PoE+X@Wj_(4c~2=bG}Jh1HV?4rMAJJQr-tRWosPA3IEl0BVlYeIQ&ioR1&w z+MCAcFpxVhp>{BhO#_O{}izu9RGc>l_VW+ERGlbLAu%0_yp&Nrf!z!{yvt7XgdsaaYkXMtbe^{t| zb;3Eb@E})ZD)h0Doe4=)==+FW6DJ1_lYX7Y)?AG>RJGih**j3qzpQrFA#gAp@}Rf_ zN&d{^S3X`u-vS5SKqg!4s+RhcM|#OKSsn0=>as(bj{BuJdfrNOdV}lb;uN&iAtho_ zb3E?tR+j1XH`f9$L=Q;O>d_3op(34!C=0u#bh4`b1Js9f^|h)fKW~v+&ow1_gRbe& z+!~>ts&mg>LJdRFoI*R(`8*_FE9y&5A9d*gc}Gih>681jO^>UW7$#A$zrlh(vR0QI z)H~$+ofLXp)kjlYQ@5Yt>^d>#O~#7%qqX7&&iZSI)&UW2NdhAHrDr?4udOx3UEB8N z`e(w(T>ZLnmDrad-$hGr!x<&CNSNlBpf@HI9(sk~a`= z=T3>$JaVOlS7^{S0&0DxDivQnF>pB$MpU6BOwRHkW+^TRMQC8#Oibk% zNK|sBrCUD1dH0DAYP6&S$HwK~+`?lN)KwUCnArHon!2XNp9wdB=oC~B3c5hWQNI*o1?`7td%cF;E0Rb;r0s7SO4#payKl`F^ft1z zYy7d3V%#6+&(ccb4_nXN;Pg6oj?3r%lbeNQ_bruH79aLk)D_rnzaet*F3$ua~IEC>NzfXEB<<6Y~SjiC)jwRQE+JijwSX1r)v9Vt-Hl zVJwWuD0^n*p|I>s3lX3Z=6PJYzbUE>H9XU!kN1hN{x)Q6em=gT|D(WT-s4jdqPt1T zyZGF&^WHjNxm>Ort+qUQQyevC#??CGiG3iHef>)1*O^x{3b>$lf%dYbt)#-q*iDn@ z`l=&z7m3}DMz~`b_c$leK2ojW?*~!X zy^}K5MbI&ZyV~fbM-r}Yg&4kH8=O1TuAWTWETtv?w)YpE10=n&Y0vZ}Qc+B563JLD zo@vkA@<8?>qE>}=&imc?)MxnRDR06ah`qufghwF9OBrzhPF)-YkS2^(;v?N zwaVZaZZhnp`23nv23Y7@wA)xcqrv+tfn8N<-*9CY#j%;~SgFw7kxs+MRYkD?GxM#% zU~TP;l2AvnJsy}bFnpsAsvV~lx3;wbij`?>^?hg7egA@dDC>}E@Yj|jtfIKSYBn(y z(2uM`3kfnhSK=>{CX=#Za=Bp^#rDHnhMVD9oRHKW^$W+0I8>72<%bevtY?<*-83yr zi(CQff+=DuuU@R*Zi|?d3-=0Rz}&W=s&SJ(V8JW!=3Tz#sj5qv{3V7c*r|CMZv2Ih zjnJZ0(J3IB^OnBU9{lLL<&c5c^<#^Q7I>TH7k-&*>ht{*z6soyBsF|M$F3Qy=S*zs7C1VP*4h^zfC?(*04*LU`^B;7CgGaVM6+UqTug&9@=C1~L(-tfIl@YV; z)&1D%TSN^+k3cm@ceWK6wk$#1z6))6jCPJIU5+oGQt^bh->G3TNGM&>lD7TLK*3u} zI5F?1tXEHT*`G_~j)W~18z5>T?tZjtqkN{&4Hqy`L)KPCJ!dGPui}Hb=PCnmY97#? 
zE)Iyb-AG@(mWP=W8;2udA&T4Gercmj({&rI*~S+M0Hc6=qi^N^U$2WYRIS@4%|N8W`4+K zQ|aE16~5cJ?yPAh5v{)F#CA0h({}pif&isxcPUc6r%*hR|9K@m0%%CObofZq9 zk&zPGnrvpxwn=vZ4|41`B8|2l457HwF?Va1#AySCk9d|DO|{#vmW~!~CoC;9mhy=rq<}yw zckRkLxo=BXTWs}^dlo(gw-Hxef!pkShXd)YGG}lr+_kR*7hPRN7;DPRH(1Xo?9b#O zX*)*fG4R^cuV~w?{SH~sqxvxW{Zf|r;f@f^INFmy%E{KAJB0lJKwMQU|(ji+@>L_t?I?==&n3At?q9C+elwzer>PF1O<*G zC(*})_{st1LSOfWt0R4OeoT~aH))^BPy`AS#g8!z+zCqZgB_HJNR>EC*6V;0x_zmX z*Dr{F5q>#(zUe!%gw~z&d)e1a2?5`FYEV1MwdK)2;6pvgqCLBH z9|+aHO-OQ&Y|pqg61r(7oBaYj4O|K7Z8PGmS%r+bW?$wigmL~S|0FkX&Z0Iaush1` z2cok-=o(c3vYMxeRE2yH_uHL`!n-4Y13mQzh)(uMr(Hw^86ASOVg>K~o`#-EE67zD%GF+%nWb&r`eQmyFK zGZzti&Jz}yyNOG+jDGut(H$V2O$eHfOOBrzBru-oN_7aKR)tdT{J=2xDEm>Va)2$^2#mk*aEqEGct$!tbNd? zT7we2TW=JqU7FvMlV(r@6&XMkweR{>&02UkW9UlyJRt_67Jc4ss6Dr%;W)@^$RI&A z=k-8E$F56X1>*goGSF|Y%NNOYNZMOv?DEqjb?lqyXIr3QiS-5NE%)#E5_43dU622e zNNx$jow=rizK?pLODSY7kSXz7FOcR?cp_^=*%^F9tHOn2LBEE2!S5-P-b!WcjXs)bZ z?_(6$w^2vAqg~=fnsb5tx|~8u>_Ek$*@>HhaqpG%IP+p$c0UU&1EJP}^UlR#Pi0zh z$Bzo1??Q8;xe#>vdtEY1{pTbfR5?9Wv4??N{!ZcBZJ->Yt_?=%c`sQyg_(1`D?xMy zc~D*@y@Svh@64jKAlEXKe)V##W3YN6*V8BaGL4%)#D_OMcm--)yb>29Iev?nf*7<) z3>T|iboW%2@kLH*Aa(0g;uK^q%Pj&hh849RMekP`nkMp{*U4f7n?3nnk#c6nhOir{ zdWC|d)_6A0yC?^?zGVrMuw~QP8|R4|e#>wxVZ43<%K_XTWuIa;gXke-*Hdw8#@B?p zFb*NL2`~r;pMT9Ev|cosxs#UJNo+?;f*nGSp96<_kBui&riFZk0I zJyXLpv6NIR+}bS_X^KU_oq8!F`d*(W{XU$;aA*Wf)<%t^-yW`vT$xw*oohqLkUS@s z+fhct}T+5~!HRdu^hcjeS8VeJzyvf%2q;iH(~35V!B` zURH~GW6+9$FvkYeELDKkt-RZq`1yPCpzk}TZ$&FIZ}QkyW|*eTW)3=~oP@X&r`nFp zOk%9;WJ7l1=lYO@HN6b&UdBSteyhKDes(M#A`^nr^`b`kL9_};RbZP1E+_gid@UjK zrcuO-?JHy5x$CRwRPAcgPKJcXW-TMNvzD^?Pn&v8>br)i8Edg)_X89pyJ6m`9ft_uC5Y{N?da>CD?e>@epV+;5pgM~_#jqSiG~B7}h*s|0%0vlk`8 zoxk^7pKC=>I=P~@rK6j5rWM-UEvpFb6+}z%>bCP4%hbsy4ji0&R6RDr3Kc6%aO>$w%eRC=^jx-*`ml z*0ese|Du zBJ^-u%bMWPi9~#2tY{5nlV|=jBFmFf3@L+eVW((7k76|ha?OYGSEr`w13?7p9!V@4 zhGbGv%AwCtx5P#FzL03kS1Z@+g>YVL?EFEwy$HZtd4YMfY?`i?tSx|vD>aErIu12AFVLxem^KgKpfLjS(}by%*3^6JX)2U5vtv$ zUMXs5c$pUPo^VZrcqBU=nM>O+V{Y)T6?Ig=$ra&;X4|{AFs+koDbAF2<H7okan9f8nFneS-5_VmwwzB}b0 zkE$p20u1U>CUd<|X*eKaO(Up5V_`5(k|MV`m49>bYxjt%E`63DM_7jMP1fRQfG75@ zWa*mc6ALC!@;WNVQWT)MJRCgnbCx-C@pv0S+ z!P_iA+3t3mpRAfnBXB2in{&oa#KM$@V_!BrnT!5O(8I+}m+VUzE~(U7KIMy63k}|E z(|anXGoK-KQ5q`bJx#-!LyQAZBp2^u?L@_<~a z8xg)aD$cX!!iJBZP0*Vam`u(`%?yQe5t4oQH_c^bd@6kvJcZKpDQ8DXPv-N9rz=Km zF{jJy5hHzm0Q)!*dStPJ@NkoSR?nHXsnSoWu>8V6F>V#Wklk$el{khZISyPzm#&7i z=+t}WYb^wAgtXl|!M!YVa2z3RdV#O)mB6&g1=ezbp+uWQzHKz5SUSA6>!UJ>VUnZ^ zvEWYmm5mlxwYlZqSUE+fp!VY|G{Gc)j09+~at5CO{UiNmh*YsA~VTD-x$V?|00$*C#l!+nLv z8iSS^ac%>iqn_!33(}U~IcRE0#AWQ%69VB!+sVK4gCXQ2azcvjgXaQ$E6-OsZK`=` z{`h^dctrm&C~D=Yu;aB_+fd;PXzUn7vPHf*Spor8RCBS}B+k;yAQtmw4%cc%u$fCi zSFR4my6j*x_|tp;X8RvFDB!ixbVeiY#+V!D!K>rRke1fc9y^E90+hXaMSB%WjdU@v z1=r~qb+-&6U{4@~)&I@@|COXuQ!+1boHWMA4Ua?#OuaP`XiU7WRmFG>_vz5Di7Cl{ zm`)ca9~8Bn1z4_61Lyxem*;$la7d8l@ZcpkVvwW?$n~si^HbFanenj)d`RAGmi1zd7C$6{P)^NxvXbGK@8zeF%nsved zAAQUmUa}TGA!qu|D+&+Mo0uDzqy2yO0F*_R<7O_lJVYFTm{zuqsXn}zyg<- zbft~W-C0W-Z}`;P=(>JQ(21HsqW@T`BB1oC3WPup>Q4l|X7aS>62L#(pG46Yx*ELloI_;YMARea zR5o@(&A0FUS|J^pf4$r|gj>$*TD%&UcZ8udP+@iF({8^#7=Bx@IjtT<0ED zY&oELH8op5Ox^vW+7Eeo0Vz}FDRz-*Q9}Qhqnal}jCCf|u3YmUS?#m<)ctE#j3y!^ zTz8L;n4&kWP(#?uBP@w;zmA*j>+3VE0WA*s*f~8vAn)5Kvgu&e2qAHT+jP?oXrP^#&+20JS}IJRoZS`n_ht!LYsT z==xa+x1ehg7h`W+_jKl}B0$06T4EdY;%m2;|J)p4XhUSU zycU1zz~q6%mFJJ2GY$J*@|?YI8lMn{akR@p^SDx1B*XWwq!@J!%TC#!2&@;~ibTNo zsaovJ+G{r8Q;q*gEDdJ;QnyiW&JR>BU3`5)nh$n{P$-x=a-sLR=Gg**K|w`E!&5Yl zPq>R0AJ3iwm-v3~gT>O(`3AJ^AxM^~t#WAdFIW6m;R)rsLm-rv*TxeJOYJ%Aie~4*xV=JhcrgJLCA^h7dP1*Co`&fV}7QB+@gX?VB 
z@gRM?E$R;aiGPUR>&W|Cj-8s7v!$3fQl>YlPvQmMJs!SQe}u)9?JPJ?m`w{lg{Ni^ z198)^dX8Uf{JB`b2?qnbj1);A6`)vhUOswI?d1j-Z+iB@JCpbx4)yKbL+pJVWq6DSE427s@HEUtjb*2SMPLe;8PmcI>t}&PZ^LA9N(S4c|Mkn8EEb2 z^11WKzu~s-gs3?>(^#d1-I3{#x6BIt zLsowa4fthUFk?;tT_!2WQk_whOTw12Mq^B>P#4QcUwwKbW2JKwSwS*xu5Wlnrh@<)|_ zpRtIC@zcklV}iPOjJe}7~d#9MVOy5*xY%S7k>gc})`{QhqrlHrA4 z!}`_Sz15tZ>y&{Gcyfy|fq3=l@*fYJzvAku?$BPk%Dl8PcB5|TqH9nz8_ zARr*!?|He_{k&`4w>;n9@5lRtZJTZ4;=0Z_j{Vs8{W!yQwN*)p7>Tg3ut?R_l=QK% z&cU&;aCKnkz&q?$-Knv#ur2Hq6?N4W71?y}J6qd1-o?V=iFc2y*TCpfwAJ?|2GXvu z1?uJ`APjjouB$q<`2VwgTPxmd5vN1<0^_)s--_ za_Kd3` zqn~F|Qz$-czL}Db4(;D)T=depbLhUucle=>{YI&mJjZdip_uiR02(gsdq~Z7TGtva z4azkl2~%0B4A=df8=~nElqq4yH?(uB`Z-*A3>KK&GcFI87c)2S$bL;rXJ5@GVM8MW zQ=5_cId+rpOr#Jeuz`~kx z_r*}uri{z26Fh6=T|!-{_mN=Ck#~*Nt+lkU_`quz7Ivr|79Mzo4gMIx9~Kr)HqKw~ zoP%fM{`DGH2m0`x;F|(0EEJZylDwfO_R2GYR72&R50N}7QAIGK2osTocY%@?rtV*^ zbrv`bi7&uC%-p&UF1LvbSt|=$b*Jjmhz8!-`!4X|phoPgsEmwEujppx!qU?C<-s>z zYpcQw*B3I3n6G$bz{LpFvRkMYd!H*tGS}dy1mIm@!zNTi;ZVi>*GoRtZ1?%JQqDho z^dCRC!H10>n(Q^t{Naqt{QhosARY~z4>pkZAHV$POM#?-Kt+w(UokiTelP#|U0rz` zBC_*Ab_RdD-~ap$8#?(as_}X5tw(JFe|omRzFWWs?w+WrocDiwNR$R=_H?g)@yEI3 z{~DmbF7oeTGmJxx6II;(UvCSW&`<=uFBS4`(sH#ajktn5@}WJQR98ROj>lem_{`&( zIZQxLYcRJQoV9NM{mEn7FC|uOpSIgC*KSKe}W!f>(h-hXaW`( zx$NF;dZ`Dt1N33Fi71ZWQ_E3tZ{Vx}LWxtLY)72mFkp5%x(xLn{)8WSR~`lTZBwzN z%@w6yP0_8Chd4G-Owa*wHdbwH%N&?Y`OJV62ZtI-^zxTeWtm(iu| zV(fpsjrsa;ikU#pb+&nLqIx@#hFj~^b3-equ_H>pz|Ea-?#!^|BM!7ZS8tu~q08d) z&=6%>lM6n(Umwos_#YT}ZhyIsIe*<$&*Q^+_-B;`^ciDN;Du9D)Gr9HZ*(O6v9NTk z8kBC%Vs67^;5EBEQCs~`%@R}TI)`J48>mn>(f~7VvpU+_K>JeiAdM1G;?%aoZ_V}& zr%qW~Lcnv&z#mSy&dkNikr4}&>xw_Dd39Lv#pou!@z0Vr7Ue_9g$y^^uiryv@vPIG zDjHrEnwTB`{POxb@_VXPQ!A76#9)ZEKj%O9U}_BB`n4XBHu&>vuZ;%E>69dBsF=9u zc*}JiWt@7A%#3zf9tj9zsx?vsA311vt9RjL97S7fc3*L(A#06cW?G#4#&{`%m|H-_ z5O-N0{Ef6q#mGM}b}VU`Epn|8zq>qH>%Dih*^|$R!<%PhayQB4WPc&F@1?5F`>*E* zLQz`GNdeSlH$U7nfZeG-IdG`l|DKKsBBY*`(K;w@p);5Exc}9CIJ(;u!5rq&`di-YTx4JyG!-s;yhsa^6*gH=DvJIqn}`AIG7* z=>4z6mwZqmqz^k*KWl<(wm*@dvmdE!yhKS9QxLtv-<6^e*||{ z@!*0e5cu`*GOuuMVM^$KnB2pgeab)70C$1E?=D6|CXTB1h!DBTf_{^ z^wE40Zqa8-xZ3<<4%e5_5`4>hMC4T_Vxew(LDbr?vd&8yA8&;bf1+f!<^e zm0c^gFa7EER- z^Tqs;WZy;`S0~?8ix7s{pC4J1Z+R`PJU8GUIG6R<;cb=mbmT`tf)i{4IECEHO&hxP zs=6`z*DVBDPg6|qf1+6^93)2Y?P<2`krcnKPm(HL{bz!^&|8U@Z00clEt^ z^0kA>3frq?eGk4_ll$+_r_6vj8C^DC^~hCat!Az6Jf)XQr68EI^GjvXNAlSY!*8+M z02-K}%6#$1ZWszC+I&b zeZN}M%IMJC9xqh4hv=T<_QJE^mdrE<%f&eEFh9#L;%Ilu|J3u1i!E4tqa`j63BL;1 zrHmasokErP?!z1UZpFBgTjnexyx69$R%Pa7UbV!;u`41tltL18WF_w(j!&H!y`GLc z2v8%L@|la2n92kpV0~*qXTeD5<9d??h?0abfoE49l3gr%r{KBKLgzNVNjKtOT)Xqx zY;%xBkHO*1d!wI>Yh^vM=(W1tNpGS0@V)9p@46A$&92MljS{cz?(Y8l`ZBC*HQ6EpI@IE{E!|G$L#V`1#wj^ufA{IkRDm+*ANo$_Jq+6n>{oe0a1Xs z0&NgoK!6}!q$`vY1Y?2PB3M#bB-f)F0ehM$>FvuHDL!G`bP4qZ`LXybYad02PpTF& znNrlZ6~><3;ePf&|AEmONXG^+GMi`aM#kQ&@2v(27Py}g;gEJak0EMd+Sf8PQp`L| zD#5P9DU?N?!-!~)Ctc-!axhMnH9+#<=co_H#CvZhRCQx<%%PYxjI`Ekn+SPNPyV%* zBU_+#?e^7Mo`Zh>522!xny`Ny*zoXN(Bt>s~%xL09b9AT8f{M%@uY5l|X?kj7W?}c#s zL4sz|L5CTpPd6;wt^N)onE8>ZhVPsg#}B>B_;a$|L}W_A6n%}}PYyG@Gkc~_59?1g zw3=yb;XP63T2*P_zTZWU-Vfi33_PbN{;7bw$+ye9FpY$gfP*>fjFlHHjHdF(^0PTI z+*4mCoql+m>h5R#_$f%^ZBuQCI6r7>VFf zVF&s~^diX%+DZRY-)cB%K7zbE{XEl1`<|=x_u&aP0a|UjjIY9ytK|c;{N_Ov&14xd zFiK>sYt+VfHU$pJ?*+bSH!TNgoNx8qM4{w}<`6V4fl3>frJ}fsM26qNl0go~TxRo3 z7Kx2Oyq-^iB{J|+Y|lYC+1<;8WI2(RGViH{hsvBU{glTGNiw>@JDEZ*v!!boBpITb zX4iAF(&5{E7@v6ilUmhdy#*`|A|k~wCALqOn~CtM{7(<3T&dUQ>=~)uZJ$uq>ajMm ze-U77CU1>aTzl=l7$Mo-ox&lUV#+6b(#<3JsZcbU4oJn=u=kN|p&j&J-Cf4J zi=U^^mOrQ&b)Hgu>fXnVR!`Q=Selsi2t3~$dLM6CO$^!f+2E3_Wsd*s|6_*>BcWu;VEywWD@gzBf?w7#nO 
zRDBA&f19~(_39X?4~lcEKKX|cQnv=XvQ99!k`<_5zV%X2^T6FUwXla&d{!|%%|}E0 zxNxJ2?$X%X)aih-oQG?Z!QBQ6Qi)7(wj_!EBNus>@9UjWHW|n;Qe>7V|GNg9zQS%t zyNr3{_<*uz6K)TwIzvzx>Kk^Mf$Qu-Pm!`(z?4mvFYPlgm+V#?(GR?fN9!1J)jlj$ ze~E~or!(|D$TA+&^e96*`i1VxttPh8VXz#@#>R-ac=#I_QNca+MYU~i7Zk#}sQSmN zl6Mj@6-SAB`wKhn`*?eCjjrm(*Vs$2ZICfD_97XW!X3i0j&Ng?G~0Q@p{1@vxi0Ai z76q5~?g(=L=w2q!F5JKWbGA}hOv4g)jncX&VJc>H9(L*D6a~CAVW}n%|x}Y>urCl{`>pOow?=p=HHd?hI zT&1176y-hli=tHr_QpDUEm%2jZ0sH_CE0$q>%j#P_O8!1S>=NMr&sG*8KPc>j}GSy zvz^zt1W%-o=}x?dM~P!)B(ZeEWo7r3dS85ocFveQ1uCv#Dd|~Drz_H^WCb}FapS~= zg`beFkbe1|xZORMw=*l4r%Be6L;0&Ww3qX!U@p;M80-ZROlsqg=iE{#Hc=e347s95NKfW>f5j@y zG(Y41Rnt5H{ow{8n@3kUX)X^Pu~VTL1?YME_0z=gi27%9MwbfTk~FDZb>8l}Jk>lG zacm(W9f%*ZVb5q#a*|vsOGvh&Q7y<~$_<|jQ-Y=WST{;$i3j4uUZ8jRT9$l#zei4@ z#yrC1HuG>AkGK*7FStu$oZrRF<(&Y3#q z)*tUgz8(!tH1&ze@6r0{PNcj&MFScs{6p8B+(j_#|}uZ!e{CyXle-x2qmi@tncw&KW2!}qIQavD#t)SEo#1c&C> z@VTaMuHsKsxQ3N(m1jw^1P=`v>@ae3bhmfdmS?fk;<uWVA-G*3kWn)+fh=_mbamtA zw)BuoVT#3aCr-`P3Ohd>FsSfNCtncT5_dS{5bdD78f!0{(L}%YeC3c^aVfPgVT!K8f8_anD)KaUb=gE{E-l6htOxt!3AW_;yL}) z-1guc&-}T~EC7v$D*3{Fzg)^Y@*7$ZFIx-8ncKXOOFJu?G$`kz{p1NzqeYL*E=kQw zX$Otg+&b4QPQu$hlhy*SzuCxTl^(w9ti{;ScDqG9+>{JPp9qBy@|tulNMccORs=QMUL{Icx5yXgFfeg)Ga5PO|%>N?LtiV6~B-m-Tn z5sHw#mA{0C7%F*VI#-*4Shju4Bi+eJM)}zPpDBZrvLU4<2cr;D&o^Zg%*uZL$-VZ| z?cnaqU#inij~zZf65r1n58YV%%GmmGOEozxyxxb_+IS;dxa!Ni_}v_0)ytY+j3y(z zh{+zcU2oAK`uL=Sxs6g=@&42)s8+(ubX=Q-Z)QCZL^h{xM~LFvSWq$0<5}X1?%Tb7 z9+Hi6waI8TQ8a8wok=4K-dP&pmWq6muw8$8?3~fny<&F|s7TW`N_4S^;!sm$qG+bU z%0uwb{6U{`RoeL##d4f(z5WYd?uH3>cF(J{QEJxY6smWBtB!f-jUk-o=Ts&s%dPsU z?ug>Y^rgV7M`buSs|y~2F$#r?^_3% zwd2Gm5rONU!n!S^9Ci9kmd0{?J++s0=SzGVxt48JQG9ol*x7_PP@ZZpctn>dY1BrO zg7Q~3!YE-KCL9Mrj8tJg@KD=y=dGs>F>!6;0)rk>A(s6|$m#u+vL5Z%b2fK=Dx>iy z>y+Gyi-sK72!i$obV)|0h6&xn zx6`a#&a|5yd0>xUZJ_IDR*q1cOK;4NUt8wOzll-RCV6Iak!oIdu2|8{R$Q^I*DLRl z!tzsx9S5^HOuDc{`P26fT3)#o2K(EkzS*mmMxa!Mp)Gm&k%V>^T;jAfFd{kyn)g?P zZzd53tp{)pRIL5F<0vRfERA$A; zmoNarGd|=9L~lvbck5bLN!I#(Q4NhdI29&H$NmBR2I_W6egb=W@_a7a+v*H<^GSM`+E<@6Ca$RRImo7^5%J+YpZvO5Tq_9M<>^yLsb!Is|L@7e=%fuJQ)-%lc$)j4J-%uTXGe1e~97@g{BA|UUvd0CwLO|`NPkz zbR0K(dd=sast3{_gm?KVFAa99@0v=HFDERf$)5ICRA=r>hv2udzs)@iK9mVJa(}P) z!^7B3wr;$VRs0|9GfIOiZkzN1S?*hn!|OjJFT$s6^)I(^Jz zg4Wx#aF@@6Mq-Q$etriLmfXdIUj5`VQEl>>FKA(MjWaqYj)MG4=y!=8;ag%f9x4yi zKrE{G;ebp;>gad%Z+uI!;d1v-xw!v+i4Lw#sSr=`Ya-vIGFE2C_^lj_3`PQjAy-F? 
[binary patch payload omitted]
literal 0
HcmV?d00001

diff --git a/docs/source/assets/deployment/streamlit-chat.png b/docs/source/assets/deployment/streamlit-chat.png
new file mode 100644
index 0000000000000000000000000000000000000000..1e37b9d70e15df2d253319dcd0ebeb123ee719a0
GIT binary patch
literal 108553
[base85-encoded PNG payload omitted]
z9=K}6(DPv9|M@zUX}%w{y*EVD-TC-MWOX~wx}!VKYORjcpdWAE^wk;9@h)<%Tvp|? zI@}%=>v^;w0pBEUX9e{{j+Y`syt-%SJLmb$w#bl$7IcrB30UoR1v4ny8?|UdV!ok~ z>x^}7jpPI~G`M*`wK`}(iW^$RDluQfOony+#BdsF?#9O5=xPuWDIMh(RD{+ z*yUo0;%KOY0P69qJObWD6g>fm2y5O+Mb`AdeZ4x>{ch~}LDRsB99&`W_ILrwLV>Dj z+x>A{4L#?ecnGEm9gfZ5!)R%+rGLFpR>7Cdrov;WOS+)yl-n*6zRCV>m zLJ`tP(g!z#;!9T4=@|{>C9~u#g$`2=ugY0l&9uaKXtfl%L|Y>%Wa4?w1N+BMqgOXW z8-vdG=V%>7N|);v%#iHk@l3_L8CIrqzO2Oz4{KR0UBJoLk9iqu%yO=vMSeS9sEX&J zL*BQF_F1&3sZ2g4e>gUKlKCQVkryjrXq;-#H>nRV52!8hk3ZFRe#1h!5}N~9Pm}H@ z-M)QQB;NvmgCmp7AL_P zSGlmBFe?jr3we9RO+n~#SRyNVesG`Zh_XGp1;=jJn*n{6R-=A~GhVTsb_c*ctOig0qdqL228=5S)nT;EqPH-K% z(|Ps;d$`A!@C6feG|*GQA^6$k0ks3N57=_$jJCcV`ZJphu}b>|p0Xh0l<_^8p>$T~ zz_K0pd;H_Y-8t+546NV}i1>zVHoJo?3TE#X1~G-Pzl1W=OhM8ix}HsGqa8HkJjwTj z-x!0}Z)i^Qio>w%8(8-w)7K8d=2jssVFbL6MCRe|z+< zZ%Z-U_WLjl(grN#IZ4Bw;EyrgwDfIR%!kHDVa$l-0|D;;SdMEO#{fgblp6hB7YU%W z(l{Vj-lz~(JFnq#yiSXiKI=5r{iLD4G3WgpY^sk#p-8It(WoM79`7=K#l1Yl8l!E) z4s~OY3eTvta)sMyVy}IZy+|Cj{}_37ax&1YBz|pPT5VR^(YE${mQ*4BE&(r=r$POi zw;v)%G&V5}i9Ohz4lj+6kjIL2Ih*e?glo5%+a3G1c8%5AOoe+^*6C#1oQoKb-Dy-~ zSa&K0880Gh83jE7H6@(*6ARLVx#$VN3ufQT&afbfMLoOKf_%5oSSH(Vzc(yK_ZaLb z7h3Vd@^wK~wjtk~D!J$(Ar8e7$(tk+(80k~YlMUWHORHnM2Q*M4BLFgZAf;#>N95& zJPwfUIyHV>v_7|?+VajOC_#kF;{(EWyqcqp-J|U?-7YJ`(M>S(Bl-;AF7ET!^b7Pa z0qw}z_Z&8;5s5U8%rg!)i%FAy`qAaa0faEhJ{P%jNFJo=F$R`QFE2xR@0s>nbRA? zor6IjMLjxbWRwV|z}D6aI_9)&#bU0n*2dP*?{|?`k7pVK(seEE5|nDSQSmMGV&@g+ z3+;5x5LCKNIJ{OH{TMkny`Dpa4*tS?heR(^Y|Yj=zMCp--W~>z1XUR=Fbv7$R*43* zR2H6|kF#6$35@tEo~;wQvoJC`pGp$gi_j5p4ewjFbgoWsqXI1YR;2<45YiI{%x&Y_ zlOLrZO*e=DjRPn2s8IUb{<&sPHe9q`#44l4+@RZM@9;{X`GXyuT$-2V{g*mZ@uxu_ z4o)19l0POZeO{1%v+%JDYoc)SF1x){BDYjgXMBn4WNU72lsjvjF>BO#DMzDI9s;_{ zb^;?Tf%!As379uD5-aq3`*q3K`p*Yxr1+kzZo=ZVF{PvrC;$8Atb`3gGj`!c-Wv-E zTmDf#0}K}`pw=xS#R$Ku#S$eGv(QOXv$#~_#o7HPZ>Ct}htI0_!gB*m;ZI*AilX(w z&_jb>RqMQVD$_4mYk^5LOqh;PIzEc-;~;+Ig(*e-b?nf?iuJabJqGL}pQj6p|ILB> z`#ui0prRV`BKx|7?oUIsiJFZE1Fj==)D8Uv&6M^zAO{B31GcrMd+D8B{~W0>Z7vHH z$D416!*Ya;1=kc<4CGQNMg*OY(K~5J@EJ)07TgkR#cO-;oaS?S84dcyuWSfbL{yLK zpQ*4!=9J;vaW}|tnlzB8wfZrr>aZ++!~;va^>2C$F*bH8zdu_OEC5j^q{+vx3Xg>+ z9E|e}sBCR4cdGVYnkk(7!XPhVxmwpg;=UY_TE-(0OT)UK8-OA44lSSz{YQp_&jP$8 z2u2kCWviBLgTrRWB8+;rvq8hR7Ii;e~?N-m2)eMMS}o(Y*PeMY{Hd8BrHc7%AG4dDOS!b*~Z!NC`EeeLOf$z zthm(bX@JFQw22+`G?j&Xy#%Y_essj!=9P-{<-3-dZ5Fm1s=|uKP@SiCay@JFIgtsm zWn6DEAkllpJqT76TnUrUuBhk1+~bHAz70{>*v;!=-z^`5g>E2zH%DNFKvOB=z%*tG z`&==&-_3xtvX$vK=xql)YPA@%PQDthxD=#;!F7KRsjaR?$tRcOaaRClP$)1(?)7TW+_U? 
zjpTDZ&vE0yV+ier^43L7k-Hh7-z5iW+@&T|_XWSV22;DQY=$~=8*H(c67;x(hiP0e z*yYxSM(-0JJdU#FJA=`otEu$pTM@hw(r{kYv(?6FMrz_hN9`z{)lWJ&naKk2)J~~l zVUGwc-avU8L$;+h^Fn?@)&#+mLOq@b!NvtUz=rZ%`PF|Ou^veJmNWvi>xkDJpl-UA zn^SBseF8&=Gi7(#!}!*X1;F^wMsnJGfcO!giAeCsYlSVjU+%CC&vWT@)6o=7hhJ8D zOrNh@wMF_>;&m9#`8aDKy2-aV1o#QUIPSx^kRTzP=ig?}vf6{aIEhVgFK&t@pfxAC zp1x2fw%J!-e3S5!s5Ny6if|g37r1rAI0}l82+32_X z&F6Em?Ty3;I6UvVO=nM*-qP@IY;|sZqz#J7>QRWR)#B9!QkfqP_Q5{|_0zQ#;Kq|T z9$OMErhgHqq@e>^sqm<5zcXQ`dxWV1tzXKgP^7f67dRO@G<63VGz4QoFeEZaK-a;Z zbamD1VG6kd_PJBYsoZjzy2#%=RjG1U#nTzvVso%0&*XCNT;emb88Ae2-QCj7&_l-z3`h;lyZ!#odCytz`&-LDaIs)* zX7A_O_jBLZ=em5!z?{&ivj7+og0QK#rc9ZG1&ZFmdMK$qkS zp5w%$+_6BQSuSo*Uw!Z*nW6*I$>^jwBvMT0s=0ao$a6Wjnx_2KrCB9C#zG}s*xL&7 z6`NPxP;38@MgUTPtp(kn#zm6*pRB>|Rvn~P_mDG^l<#{Q#1@V*7uFvFM1vtx*tqph z^gsQpZndiTgb<-6g*#+fmT&vc`|c_1#Ho8+!fPuB_jCppU;f<#^#o8QZTem$qXmr# z8p;A|yy*K%n*nu%IS<0ZA=b**8I={`}71lNDX^+NsW}s`FSr& zNbYxuhK+)tkL@&utn8GoJ>N*vE~dF4>O}+c7Eb&LSYNlxUhd~kFj>*HUB;J#JDGuZ z$22my?RnUa7j{ZO)^ucd-*wdw6ys#1|8Z0zwY4Uhj{ zQzmf}0uJ;*J<$o)@QcPZ1VmHZ^6lFM0;F6w%aYEcRNm5|ylwPQ;VcUt z*8q)Cayz#br}W@^VRe(qT8{wPE3|7F;q^tdNHlcVz9tAY>EhU{qWmOVm zF8ff2n@)b@Wu(IG26~-jXpMBvWP`q?#`B{D2^2tBO<67IHH!K|X?U01inK&Td2>_P zefFK{*Y-JiefbR?--REhpW1)2$5X6k?lU(YyvkX0Xrl|gASD^Bvs!x(7?p%A#3`{TTEhbD2No8DPeREOLzAbqg=J4SyXn%vmr zbN9}V8AmhHfIqiqqC9)m)^X_0m7?yQ{O*BFpVNLb@y4GYu3g9S^&In)JQjQ(F_yxR zWA+{NRIt>QYBq)ZFfu#1c~8yFu}>R$rl~j)eU?JLGnk**(BX#JZ*2o&4C(&-`|OHzN@rMEV7 zls1HWSxbmkyaD{MF^)nwyuX(rC~7j^``LHT zrV#FTv|jC4G2(dR`tIUS5*?w;a-kFD7MYZxP#)Z))eUA>5ey&*@=4>Aqr8_uGSD@1 z4+%N4KDUzq#sgRQ8~z1fVnl>YJim!dkGWGCAM2PyUT0gC-H!p$fGeSiNgQ!EYhl?w zSnvJp^(|s*FpE@*G~&%qg*3i29)3N4-`^07TivyykIZMSm*)X%L3Wkxe7)O5`=(6~ zxSHj9AJL1tse@UZGRt0b&ib+~w6mypJ}rdw7*zo_IFU~8M)i8H?B4qaJ-k}DXI9p| zx5Lv=8X=Odl+BD$-rZ)LsXI3O>F4YHd$N0i#8eo)_4Buh+K91W`M=fj4{>!>K4QFL z{n=B@em8F4pKa1IkSz?8;;07e#b4iwvF>hq$ADK*R?>Vqb^IGBpViUQ1u}vI2sz@? zy!y*Qm1rFE<+7gyqiDUqit+s{b5S=(j3S)Me^#KTg zFe{6q2z*0vE&V1L#Nl%#Kd_1my(y1M4C)HSrRstKosCK;-ob*?i~NL^*+3Y@Qwy#2 z9E++O`uFP)i<=%FIAFN>2W;qHo?>uQ)ZuXfog3@(>5^jdi&udd(a3a>d^-6R!52*_ zG&PIob8<=w7Ou4zd*h)J_Mr2RCR$&^VEa3r*>eI{Ur%;(C?=ZPE$NAl_v5!u8p0TwPQz9 zOq!n4{48>J@{99D%!i<7Y}4)av-f)=)L^No zawH_8Wys*DPe)~NjvSs~uFX}1D948l0dhGLVVMbE_e9Ujmg5d@^E9%6UG^(&=v?WG zI=PJo47lnI0%m>MQ%e#5!Na1@Q@`1{e(?c#*^hp85LU~n6el=5)|oU6m2~R1x)G`? zdpUJX9$n`vn>nL0L^Pblxrg7axN&2;{q;57?Uy;Gky9e(dWa8Ww17#hz=$8>h;(2- zWWk!+@4WcHWu`!p$CnEJ!I-C73zw5Z%J z_5I0htNocXDg&v1{!yfTutRiYTkIeqsqK2%*P|d@uM{PSPLaeeImCtaU8NJGV