
Commit 31aeda0

use the correct configuration
1 parent e19cced commit 31aeda0

2 files changed: +6, -3 lines


test_llama4_eplb.py

Lines changed: 1 addition & 0 deletions

@@ -19,6 +19,7 @@ def main():
     llm = LLM(
         model="/fp8-llama/llama4scout-fp8/",
         tensor_parallel_size=8,
+        max_model_len=2048,
         enable_expert_parallel=True,
         enable_eplb=True,
         num_redundant_experts=16,
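
For context, a minimal sketch of how the updated test configuration might be exercised end to end. The prompt and sampling settings below are illustrative and not part of this commit; only the LLM() arguments mirror the diff above.

# Illustrative only: running the EPLB test configuration from test_llama4_eplb.py.
from vllm import LLM, SamplingParams

llm = LLM(
    model="/fp8-llama/llama4scout-fp8/",
    tensor_parallel_size=8,
    max_model_len=2048,           # cap context length for the test run
    enable_expert_parallel=True,
    enable_eplb=True,             # expert-parallel load balancing
    num_redundant_experts=16,
)

# Hypothetical prompt and sampling parameters, just to drive the engine once.
outputs = llm.generate(
    ["The capital of France is"],
    SamplingParams(temperature=0.0, max_tokens=32),
)
print(outputs[0].outputs[0].text)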

vllm/model_executor/models/llama4.py

Lines changed: 5 additions & 3 deletions

@@ -369,11 +369,13 @@ def __init__(self,
         # Store the original layer_type and override it with a lambda
         original_layer_type = layer_type
 
-        def create_layer(prefix):
+        def create_layer(config, cache_config, quant_config, prefix):
+            # We use the config from vllm_config instead of the passed one
+            # to ensure we get the Llama4TextConfig type
             config = cast(Llama4TextConfig, vllm_config.model_config.hf_config)
             return original_layer_type(config=config,
-                                       cache_config=vllm_config.cache_config,
-                                       quant_config=vllm_config.quant_config,
+                                       cache_config=cache_config,
+                                       quant_config=quant_config,
                                        prefix=prefix,
                                        enable_eplb=self.enable_eplb)
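
The signature change matters because of how the wrapper is consumed: the caller supplies config, cache_config, quant_config, and prefix, and create_layer now accepts all four, forwarding the cache/quant settings while overriding only the config. The self-contained toy below illustrates that shape; the class and argument values are hypothetical and not vLLM's API.

# Toy sketch of the factory pattern used by create_layer (names are hypothetical).
from dataclasses import dataclass

@dataclass
class DecoderLayer:
    config: object
    cache_config: object
    quant_config: object
    prefix: str

trusted_config = {"type": "Llama4TextConfig"}   # stands in for vllm_config.model_config.hf_config

def create_layer(config, cache_config, quant_config, prefix):
    # Ignore the caller's config; forward its cache/quant settings unchanged.
    return DecoderLayer(config=trusted_config,
                        cache_config=cache_config,
                        quant_config=quant_config,
                        prefix=prefix)

# The caller invokes the factory with all four keyword arguments; the old
# create_layer(prefix) signature would raise a TypeError here.
layer = create_layer(config={"type": "LlamaConfig"},
                     cache_config="cache-cfg",
                     quant_config="quant-cfg",
                     prefix="model.layers.0")
print(layer)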
