Commit 813e0b8

vanbasten23 authored and rahul-tuli committed
Use xla flag to improve the quantized model performance (#19303)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
Signed-off-by: Rahul Tuli <rahul@neuralmagic.com>
1 parent 4589b94 · commit 813e0b8

File tree: 1 file changed, +4 −1 lines changed

vllm/v1/worker/tpu_worker.py

Lines changed: 4 additions & 1 deletion
@@ -101,7 +101,10 @@ def init_device(self):
         # fix this. It will be removed after the bug in XLA compiler is fixed.
         os.environ["LIBTPU_INIT_ARGS"] = (
             os.environ.get("LIBTPU_INIT_ARGS", "") +
-            " --xla_tpu_force_1d_allreduce_at_chunk_count=1")
+            " --xla_tpu_force_1d_allreduce_at_chunk_count=1"
+            " --xla_jf_conv_input_fusion=False")
+        # --xla_jf_conv_input_fusion=False is used to improve the perf of
+        # quantized matmul.
         torch.set_grad_enabled(False)
         torch.set_default_dtype(self.model_config.dtype)
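
For context, a minimal standalone sketch (an assumption, not part of the commit): applying the same LIBTPU_INIT_ARGS tweak manually before starting a TPU process, mirroring what init_device() does above. It preserves any flags already set in the environment; --xla_jf_conv_input_fusion=False is the flag this commit adds to speed up quantized matmul.

import os

# Append the XLA flags used by vLLM's TPU worker, keeping any flags that are
# already present in LIBTPU_INIT_ARGS. The conv-input-fusion flag improves the
# performance of quantized matmul; the allreduce flag works around an XLA
# compiler bug (see the comment in the diff above).
os.environ["LIBTPU_INIT_ARGS"] = (
    os.environ.get("LIBTPU_INIT_ARGS", "") +
    " --xla_tpu_force_1d_allreduce_at_chunk_count=1"
    " --xla_jf_conv_input_fusion=False")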
