
Commit ba8c300

[BugFix] VLLM_DISABLE_COMPILE_CACHE=1 should disable all reads and writes from the cache (#20942)

Signed-off-by: Richard Zou <zou3519@gmail.com>

Parent: 8cdc371

File tree: 4 files changed, +33 -2 lines

  tests/compile/test_config.py
  vllm/compilation/backends.py
  vllm/compilation/compiler_interface.py
  vllm/compilation/counter.py
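
For context, the flag being tested is an ordinary environment variable, so the behavior this commit fixes can be exercised by setting it before a model is loaded. A minimal sketch (the model name and memory setting simply mirror the new unit test; they are not mandated by the commit):

import os

# Set the flag before vLLM initializes compilation; "1" disables all compile
# cache reads and writes.
os.environ["VLLM_DISABLE_COMPILE_CACHE"] = "1"

from vllm import LLM

# Any small model that triggers torch.compile works as a smoke test.
llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
print(llm.generate(["Hello, my name is"])[0].outputs[0].text)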

tests/compile/test_config.py

Lines changed: 24 additions & 0 deletions

@@ -26,6 +26,30 @@ def test_use_cudagraphs_dynamic(monkeypatch):
     assert not vllm_config.compilation_config.use_cudagraph
 
 
+# NB: We don't test VLLM_DISABLE_COMPILE_CACHE=0 because that depends
+# on the state of the cache directory on the current machine, which
+# may be influenced by other tests.
+@pytest.mark.parametrize("val", ["1"])
+def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
+    assert vllm.envs.VLLM_USE_V1
+
+    # spawn means that the counters are in the same process.
+    monkeypatch.setenv('VLLM_WORKER_MULTIPROC_METHOD', "spawn")
+    monkeypatch.setenv('VLLM_DISABLE_COMPILE_CACHE', val)
+
+    compilation_config = {
+        "use_cudagraph": False,  # speed things up a bit
+    }
+    with (
+            compilation_counter.expect(num_cache_entries_updated=0,
+                                       num_compiled_artifacts_saved=0),
+            # loading the model causes compilation (if enabled) to happen
+            vllm_runner('facebook/opt-125m',
+                        compilation_config=compilation_config,
+                        gpu_memory_utilization=0.4) as _):
+        pass
+
+
 @pytest.mark.parametrize("enabled", [True, False])
 def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
     assert vllm.envs.VLLM_USE_V1
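
The new test can be run in isolation with a standard pytest filter (the exact invocation is an editor's suggestion, not part of the commit):

pytest tests/compile/test_config.py -k test_VLLM_DISABLE_COMPILE_CACHE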

vllm/compilation/backends.py

Lines changed: 2 additions & 1 deletion

@@ -183,9 +183,10 @@ def compile(self,
         assert compiled_graph is not None, "Failed to compile the graph"
 
         # store the artifact in the cache
-        if handle is not None:
+        if not envs.VLLM_DISABLE_COMPILE_CACHE and handle is not None:
             self.cache[(runtime_shape, graph_index,
                         self.compiler.name)] = handle
+            compilation_counter.num_cache_entries_updated += 1
             self.is_cache_updated = True
             if graph_index == 0:
                 # adds some info logging for the first graph
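
The envs module is not part of this diff; the sketch below restates the guarded write in isolation, with a local stand-in for how the boolean flag is assumed to be derived from the VLLM_DISABLE_COMPILE_CACHE environment variable:

import os

# Assumption: the real vllm.envs flag maps "1" to True; this stand-in is only
# for illustration and is not vLLM's actual definition.
VLLM_DISABLE_COMPILE_CACHE = os.environ.get("VLLM_DISABLE_COMPILE_CACHE", "0") == "1"

def maybe_cache_handle(cache: dict, key: tuple, handle) -> bool:
    # Mirrors the guarded write in the hunk above: when the flag is set, the
    # in-memory cache is never updated, so num_cache_entries_updated stays 0.
    if not VLLM_DISABLE_COMPILE_CACHE and handle is not None:
        cache[key] = handle
        return True
    return False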

vllm/compilation/compiler_interface.py

Lines changed: 3 additions & 1 deletion

@@ -213,7 +213,9 @@ def compile(
         # Save the compiled artifact to disk in the specified path
         assert key is not None
         path = os.path.join(self.cache_dir, key)
-        compiled_graph.save(path=path, format="unpacked")
+        if not envs.VLLM_DISABLE_COMPILE_CACHE:
+            compiled_graph.save(path=path, format="unpacked")
+            compilation_counter.num_compiled_artifacts_saved += 1
         return compiled_graph, (key, path)
 
     def load(self,

vllm/compilation/counter.py

Lines changed: 4 additions & 0 deletions

@@ -23,6 +23,10 @@ class CompilationCounter:
     num_inductor_compiles: int = 0
     # EagerAdapter.compile calls
     num_eager_compiles: int = 0
+    # The number of times vLLM's compiler cache entry was updated
+    num_cache_entries_updated: int = 0
+    # The number of standalone_compile compiled artifacts saved
+    num_compiled_artifacts_saved: int = 0
 
     def clone(self) -> "CompilationCounter":
         return copy.deepcopy(self)
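
The test relies on compilation_counter.expect(...), whose definition sits outside this diff. A minimal sketch of how such a helper can be built on top of the clone() method shown above, using a simplified counter (an illustration of the idea, not vLLM's actual implementation):

import contextlib
import copy
from dataclasses import dataclass

@dataclass
class Counter:
    # Simplified stand-ins for the two fields added in this commit.
    num_cache_entries_updated: int = 0
    num_compiled_artifacts_saved: int = 0

    def clone(self) -> "Counter":
        return copy.deepcopy(self)

    @contextlib.contextmanager
    def expect(self, **expected_deltas: int):
        # Snapshot the counters on entry and check the deltas on exit.
        before = self.clone()
        yield
        for name, expected in expected_deltas.items():
            actual = getattr(self, name) - getattr(before, name)
            assert actual == expected, (
                f"{name}: expected delta {expected}, got {actual}")

With VLLM_DISABLE_COMPILE_CACHE=1, expect(num_cache_entries_updated=0, num_compiled_artifacts_saved=0) then asserts that neither counter moved while the model was being loaded.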
