vllm-project · rafvasq · Jul 25, 2025 · Jul 16, 2025 · Jul 22, 2025 · Jul 22, 2025
diff --git a/examples/offline_inference/spyre_inference.py b/examples/offline_inference/spyre_inference.py
@@ -36,6 +36,10 @@
                     choices=['eager', 'sendnn'])
 parser.add_argument("--compare-with-cpu",
                     action=argparse.BooleanOptionalAction)
+parser.add_argument("--quantization",
+                    type=str,
+                    default='none',
+                    choices=['fp8', 'gptq', 'none'])
 args = parser.parse_args()
 
 if platform.machine() == "arm64":
@@ -82,11 +86,13 @@
                                  temperature=0.0,
                                  ignore_eos=True)
 # Create an LLM.
-llm = LLM(model=args.model,
-          tokenizer=args.model,
-          max_model_len=args.max_model_len,
-          block_size=2048,
-          tensor_parallel_size=args.tp)
+llm = LLM(
+    model=args.model,
+    tokenizer=args.model,
+    max_model_len=args.max_model_len,
+    block_size=2048,
+    tensor_parallel_size=args.tp,
+    quantization=args.quantization if args.quantization != "none" else None)
 
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.

diff --git a/tests/e2e/test_spyre_online.py b/tests/e2e/test_spyre_online.py
@@ -57,6 +57,7 @@ def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
         assert "warmup" in str(e)
 
 
+@pytest.mark.quantized
 @pytest.mark.parametrize("model", get_spyre_model_list(quantization="gptq"))
 @pytest.mark.parametrize("backend", ["sendnn"])
 @pytest.mark.parametrize("quantization", ["gptq"])
@@ -82,6 +83,31 @@ def test_openai_serving_gptq(remote_openai_server, model, backend,
     assert len(completion.choices[0].text) > 0
 
 
+@pytest.mark.quantized
+@pytest.mark.parametrize("model", get_spyre_model_list(quantization="fp8"))
+@pytest.mark.parametrize("backend", ["sendnn"])
+@pytest.mark.parametrize("warmup_shape", [[(64, 20, 1)]])
+def test_openai_serving_fp8(remote_openai_server, model, backend,
+                            warmup_shape):
+    """Test online serving an FP8 model with the sendnn backend only"""
+
+    client = remote_openai_server.get_client()
+    completion = client.completions.create(model=model,
+                                           prompt="Hello World!",
+                                           max_tokens=5,
+                                           temperature=0.0)
+    assert len(completion.choices) == 1
+    assert len(completion.choices[0].text) > 0
+
+    completion = client.completions.create(model=model,
+                                           prompt="Hello World!",
+                                           max_tokens=5,
+                                           temperature=1.0,
+                                           n=2)
+    assert len(completion.choices) == 2
+    assert len(completion.choices[0].text) > 0
+
+
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("cb",
                          [pytest.param(1, marks=pytest.mark.cb, id="cb")])

diff --git a/tests/spyre_util.py b/tests/spyre_util.py
@@ -179,6 +179,7 @@ def generate_spyre_vllm_output(
     warmup_shapes: Optional[list[tuple[int, int, int]]] = None,
     max_num_seqs: Optional[int] = None,
     use_cb: bool = False,
+    quantization: Optional[str] = None,
 ) -> list[dict[str, Any]]:
 
     # ---- For static batching ----
@@ -205,14 +206,13 @@ def generate_spyre_vllm_output(
     # shutdown engine this context.
     monkeypatch.setenv("VLLM_SPYRE_OVERRIDE_SIGNALS_HANDLER", "1")
 
-    vllm_model = LLM(
-        model=model,
-        tokenizer=model,
-        max_model_len=max_model_len,
-        max_num_seqs=max_num_seqs,
-        block_size=block_size,
-        tensor_parallel_size=tensor_parallel_size,
-    )
+    vllm_model = LLM(model=model,
+                     tokenizer=model,
+                     max_model_len=max_model_len,
+                     max_num_seqs=max_num_seqs,
+                     block_size=block_size,
+                     tensor_parallel_size=tensor_parallel_size,
+                     quantization=quantization)
 
     vllm_outputs = vllm_model.generate(prompts, sampling_params)
     results = []
@@ -505,6 +505,11 @@ def _get_or_default(env: str, default: str) -> str:
         user_test_model_list = _get_or_default("VLLM_SPYRE_TEST_MODEL_LIST",
                                                "granite-3.0-8b-instruct-gptq")
         marks = [pytest.mark.decoder, pytest.mark.quantized, pytest.mark.spyre]
+    elif quantization == "fp8":
+        # TODO: need a HF hub reference here as a default
+        user_test_model_list = _get_or_default("VLLM_SPYRE_TEST_MODEL_LIST",
+                                               "granite-3.3-8b-instruct-FP8")
+        marks = [pytest.mark.decoder, pytest.mark.quantized, pytest.mark.spyre]
     else:
         user_test_model_list = _get_or_default(
             "VLLM_SPYRE_TEST_MODEL_LIST",

diff --git a/vllm_spyre/model_executor/model_loader/spyre.py b/vllm_spyre/model_executor/model_loader/spyre.py
@@ -171,25 +171,34 @@ def load_weights(
         **kwargs,
     ) -> None:
 
-        if model_config.quantization == "gptq":
+        quant_type = model_config.quantization
+        if quant_type in ("gptq", "fp8"):
             if envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND == "sendnn":
-                from fms_mo.aiu_addons.gptq import (  # noqa: F401
-                    gptq_aiu_adapter, gptq_aiu_linear)
-                linear_type = "gptq_aiu"
+                if quant_type == "gptq":
+                    from fms_mo.aiu_addons.gptq import (  # noqa: F401
+                        gptq_aiu_adapter, gptq_aiu_linear)
+                else:
+                    from fms_mo.aiu_addons.fp8 import (  # noqa: F401
+                        fp8_adapter, fp8_linear)
+                linear_type = f"{quant_type}_aiu"
                 logger.info("Loaded `aiu_addons` functionalities")
             else:
-                linear_type = "gptq_cpu"
-                logger.warning("GPTQ is not expected to work on CPU.")
+                linear_type = f"{quant_type}_cpu"
+                logger.warning("%s is not expected to work on CPU.",
+                               quant_type.upper())
 
             quant_cfg = model_config._parse_quant_hf_config()
 
-            linear_config = {
-                "linear_type": linear_type,
-                "group_size": quant_cfg['group_size'],
-                "desc_act": quant_cfg['desc_act'],
-            }
+            linear_config = {"linear_type": linear_type}
+            if quant_type == "gptq":
+                linear_config.update({
+                    "group_size": quant_cfg["group_size"],
+                    "desc_act": quant_cfg["desc_act"],
+                })
+
             self.dtype = None
-            model_source = "hf_gptq_aiu"
+            model_source = f"hf_{quant_type}_aiu"
+
         else:
             linear_config = {"linear_type": "torch_linear"}
             model_source = "hf"

diff --git a/vllm_spyre/platform.py b/vllm_spyre/platform.py
@@ -40,7 +40,7 @@ class SpyrePlatform(Platform):
     # "spyre" device_name no longer worked due to https://github.com/vllm-project/vllm/pull/16464
     device_name: str = "cpu"
     device_type: str = "cpu"
-    supported_quantization: list[str] = ["gptq"]
+    supported_quantization: list[str] = ["gptq", "fp8", "compressed-tensors"]
     _warmup_shapes: Optional[tuple[dict[str, int], ...]] = None
     _block_size: int = 64  # hardcoded Spyre constraint for now
     _config: VllmConfig = None