Disable remote caching when calling compile_fx (vllm-project#16611)

zou3519 · yangw-dev · commit 257b00ec8eb9 · 2025-04-21T10:08:15.000-07:00
Signed-off-by: rzou &lt;zou3519@gmail.com&gt;
Signed-off-by: Yang Wang &lt;elainewy@meta.com&gt;
diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
@@ -290,6 +290,19 @@ def _get_shape_env() -> AlwaysHitShapeEnv:
             # Dynamo metrics context, see method for more details.
             stack.enter_context(self.metrics_context())
 
+            # Disable remote caching. When these are on, on remote cache-hit,
+            # the monkey-patched functions never actually get called.
+            # vLLM today assumes and requires the monkey-patched functions to
+            # get hit.
+            # TODO(zou3519): we're going to replace this all with
+            # standalone_compile sometime.
+            if is_torch_equal_or_newer("2.6"):
+                stack.enter_context(
+                    torch._inductor.config.patch(fx_graph_remote_cache=False))
+                stack.enter_context(
+                    torch._functorch.config.patch(
+                        enable_remote_autograd_cache=False))
+
             compiled_graph = compile_fx(
                 graph,
                 example_inputs,