 from vllm.attention.backends.utils import CommonAttentionState
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import cdiv, next_power_of_2
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv, next_power_of_2

 logger = init_logger(__name__)

@@ -137,8 +137,6 @@ def __init__(
             raise NotImplementedError("Head size must be a multiple of 128.")
         if alibi_slopes is not None:
             raise NotImplementedError("Alibi slopes is not supported.")
-        if kv_cache_dtype != "auto":
-            raise NotImplementedError("FP8 KV cache dtype is not supported.")
         if blocksparse_params is not None:
             raise NotImplementedError("Blocksparse is not supported.")

@@ -151,6 +149,14 @@ def __init__(
         tpu_version = torch_xla.tpu.version()
         if tpu_version < 4:
             raise NotImplementedError("TPU version must be 4 or higher.")
+        self.kv_cache_quantized_dtype = None
+        if kv_cache_dtype != "auto":
+            if tpu_version < 5:
+                raise NotImplementedError(
+                    "FP8 KV cache dtype is only supported when TPU version"
+                    " is 5 or higher.")
+            self.kv_cache_quantized_dtype = STR_DTYPE_TO_TORCH_DTYPE.get(
+                kv_cache_dtype.lower().strip())

     def forward(
         self,
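Note on the new gate: kv_cache_dtype strings other than "auto" are resolved to a torch dtype via STR_DTYPE_TO_TORCH_DTYPE, but only on TPU v5 or newer. The snippet below is a minimal, standalone sketch of that gate; the dtype mapping it uses is illustrative (it does not reproduce vllm.utils.STR_DTYPE_TO_TORCH_DTYPE), and it assumes a PyTorch build that ships the float8 dtypes.

from typing import Optional

import torch

# Illustrative stand-in for vllm.utils.STR_DTYPE_TO_TORCH_DTYPE (assumed entries).
_STR_TO_DTYPE = {
    "fp8_e4m3": torch.float8_e4m3fn,
    "fp8_e5m2": torch.float8_e5m2,
}


def resolve_kv_cache_dtype(kv_cache_dtype: str,
                           tpu_version: int) -> Optional[torch.dtype]:
    # "auto" keeps the KV cache in the model dtype: no quantization.
    if kv_cache_dtype == "auto":
        return None
    # Quantized KV cache is gated on TPU v5+, mirroring the check in __init__.
    if tpu_version < 5:
        raise NotImplementedError(
            "FP8 KV cache dtype is only supported when TPU version"
            " is 5 or higher.")
    return _STR_TO_DTYPE.get(kv_cache_dtype.lower().strip())


print(resolve_kv_cache_dtype("auto", 4))      # None
print(resolve_kv_cache_dtype("fp8_e4m3", 6))  # torch.float8_e4m3fn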
@@ -179,15 +185,16 @@ def forward(
             output = torch.ones_like(query)
             return output

-        assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0
         num_tokens, hidden_size = query.shape
         query = query.view(num_tokens, self.num_heads, self.head_size)

         if self.kv_sharing_target_layer_name is None and kv_cache.numel() > 0:
             # Write input keys and values to the KV cache.
             # Skip this if sharing KV cache with an earlier attention layer.
             slot_mapping = attn_metadata.slot_mapping
-            write_to_kv_cache(key, value, kv_cache, slot_mapping)
+            write_to_kv_cache(key, value, kv_cache, slot_mapping,
+                              self.kv_cache_quantized_dtype,
+                              layer._k_scale_float, layer._v_scale_float)

         output = torch.ops.xla.ragged_paged_attention(
             query,
@@ -206,6 +213,8 @@ def forward(
             sm_scale=self.scale,
             sliding_window=self.sliding_window,
             soft_cap=self.logits_soft_cap,
+            k_scale=1 / layer._k_scale_float,
+            v_scale=1 / layer._v_scale_float,
         )

         return output.reshape(num_tokens, hidden_size)
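One way to read the reciprocal: write_to_kv_cache multiplies keys/values by layer._k_scale_float / layer._v_scale_float before the FP8 cast, and the kernel is handed one over those scales. Assuming ragged_paged_attention multiplies the cached values by the k_scale / v_scale arguments it receives (an assumption, not verified from the kernel source), the two factors cancel and the round trip approximately recovers the original tensors, as in this sketch:

import torch

x = torch.randn(4, 2, 128, dtype=torch.bfloat16)  # [num_tokens, num_kv_heads, head_size]
k_scale_float = 0.05                              # illustrative per-layer scale

# Write path (as in write_to_kv_cache): scale, then cast to the quantized dtype.
cached = (x * k_scale_float).to(torch.float8_e4m3fn)

# Read path (assumed kernel behaviour): multiply by the reciprocal scale,
# i.e. the k_scale=1 / layer._k_scale_float argument passed above.
recovered = cached.to(torch.bfloat16) * (1 / k_scale_float)

print(torch.allclose(x, recovered, atol=0.1, rtol=0.1))  # expected True, up to FP8 rounding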
@@ -216,6 +225,9 @@ def write_to_kv_cache(
     value: torch.Tensor,
     kv_cache: torch.Tensor,
     slot_mapping: torch.Tensor,
+    kv_cache_quantized_dtype: Optional[torch.dtype] = None,
+    k_scale: float = 1.0,
+    v_scale: float = 1.0,
 ) -> None:
     """Write the key and values to the KV cache.

@@ -230,6 +242,11 @@ def write_to_kv_cache(

     key = key.view(-1, num_kv_heads, head_size)
     value = value.view(-1, num_kv_heads, head_size)
+    if kv_cache_quantized_dtype is not None:
+        key = key * k_scale
+        key = key.to(kv_cache_quantized_dtype)
+        value = value * v_scale
+        value = value.to(kv_cache_quantized_dtype)

     kv = torch.cat([key, value], axis=-1).reshape(-1, num_combined_kv_heads,
                                                   head_size)
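Why scale before the cast: FP8 E4M3 has a narrow representable range (torch.finfo(torch.float8_e4m3fn).max is 448.0) and a coarse mantissa, so the per-layer scale is presumably what keeps key/value magnitudes inside that range before the lossy cast. Below is a minimal, self-contained sketch of the quantization step added above, assuming torch's float8 dtypes; the helper name is made up for illustration and is not part of the PR.

from typing import Optional, Tuple

import torch


def quantize_kv(key: torch.Tensor,
                value: torch.Tensor,
                kv_cache_quantized_dtype: Optional[torch.dtype] = None,
                k_scale: float = 1.0,
                v_scale: float = 1.0) -> Tuple[torch.Tensor, torch.Tensor]:
    # Mirrors the new branch in write_to_kv_cache: scale first, then cast.
    if kv_cache_quantized_dtype is not None:
        key = (key * k_scale).to(kv_cache_quantized_dtype)
        value = (value * v_scale).to(kv_cache_quantized_dtype)
    return key, value


k = torch.randn(16, 2, 128, dtype=torch.bfloat16)
v = torch.randn(16, 2, 128, dtype=torch.bfloat16)
qk, qv = quantize_kv(k, v, torch.float8_e4m3fn, k_scale=0.1, v_scale=0.1)
print(qk.dtype, qv.dtype)                     # torch.float8_e4m3fn torch.float8_e4m3fn
print(torch.finfo(torch.float8_e4m3fn).max)   # 448.0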