Commit a9e7a00

Fix API typo and remove FP8-on-V1 restriction
1 parent 8c211e5 commit a9e7a00

2 files changed: 1 addition, 18 deletions


vllm/attention/ops/prefix_prefill.py

Lines changed: 1 addition & 1 deletion
@@ -38,11 +38,11 @@ def _fwd_kernel(Q,
                 V,
                 K_cache,
                 V_cache,
-                out_scale,
                 B_Loc,
                 sm_scale,
                 k_scale,
                 v_scale,
+                out_scale,
                 B_Start_Loc,
                 B_Seqlen,
                 x: tl.constexpr,
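
The one-line change above moves out_scale from between V_cache and B_Loc down to between v_scale and B_Start_Loc, so the kernel's parameter list matches the order in which the launch site passes its arguments. Triton kernels are typically launched with positional arguments, so a parameter declared in the wrong slot silently binds to whatever value the caller passes in that position. A minimal sketch of the failure mode, using plain Python functions rather than the actual _fwd_kernel (all argument values here are illustrative):

    # Simplified stand-ins for the kernel signature before and after the fix.
    def kernel_before(V_cache, out_scale, B_Loc, sm_scale, k_scale, v_scale):
        # out_scale is declared too early in the parameter list.
        return {"out_scale": out_scale, "B_Loc": B_Loc}

    def kernel_after(V_cache, B_Loc, sm_scale, k_scale, v_scale, out_scale):
        # out_scale now sits where the call site actually passes it.
        return {"out_scale": out_scale, "B_Loc": B_Loc}

    # A call site that passes everything positionally, out_scale last:
    args = ("V_CACHE", "B_LOC", 0.125, 1.0, 1.0, 0.5)

    print(kernel_before(*args))  # {'out_scale': 'B_LOC', 'B_Loc': 0.125} -- misbound
    print(kernel_after(*args))   # {'out_scale': 0.5, 'B_Loc': 'B_LOC'} -- correct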

vllm/engine/arg_utils.py

Lines changed: 0 additions & 17 deletions
@@ -1368,23 +1368,6 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
                                recommend_to_remove=False)
             return False
 
-        if current_platform.is_rocm():
-            from vllm.model_executor.layers.quantization.fp8 import Fp8Config
-            load_config = self.create_load_config()
-            quantization_config = VllmConfig.get_quantization_config(
-                model_config, load_config)
-            if isinstance(quantization_config, Fp8Config):
-                _raise_or_fallback(feature_name="fp8 for ROCm",
-                                   recommend_to_remove=False)
-                return False
-            from vllm.model_executor.layers.quantization.quark.quark import (
-                QuarkConfig)
-
-            if isinstance(quantization_config, QuarkConfig
-                          ) and quantization_config.has_fp8_layer_weights():
-                _raise_or_fallback(feature_name="Quark fp8 for ROCm",
-                                   recommend_to_remove=False)
-
         # No Fp8 KV cache so far.
         if self.kv_cache_dtype != "auto":
             fp8_attention = self.kv_cache_dtype.startswith("fp8")
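
The deleted block was what gated FP8-quantized models on ROCm out of the V1 engine. As the diff context shows, _is_v1_supported_oracle acts as a gatekeeper: each check either raises (when V1 was explicitly requested) or logs and falls back to the V0 engine by returning False. A rough, self-contained sketch of that raise-or-fallback shape, simplified from the real method (the UnsupportedOnV1 exception, the v1_requested flag, and is_v1_supported are illustrative stand-ins, not vLLM's actual API):

    # Simplified sketch of the raise-or-fallback pattern; not vLLM's code.
    class UnsupportedOnV1(Exception):
        pass

    def _raise_or_fallback(feature_name: str, recommend_to_remove: bool,
                           v1_requested: bool = False) -> None:
        # Fail loudly if the user explicitly asked for V1; otherwise warn so
        # the caller can return False and fall back to the V0 engine.
        if v1_requested:
            raise UnsupportedOnV1(f"{feature_name} is not supported on V1")
        msg = f"Falling back to V0: {feature_name} is not supported on V1."
        if recommend_to_remove:
            msg += " Consider removing this setting."
        print(msg)

    def is_v1_supported(kv_cache_dtype: str = "auto") -> bool:
        # After this commit, FP8 model weights on ROCm no longer trip a
        # fallback here; the FP8 KV-cache check below still does.
        if kv_cache_dtype != "auto" and kv_cache_dtype.startswith("fp8"):
            _raise_or_fallback(feature_name="fp8 KV cache",
                               recommend_to_remove=False)
            return False
        return True

    print(is_v1_supported())            # True
    print(is_v1_supported("fp8_e4m3"))  # prints fallback notice, then False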
