diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index b3b6bb4749d..213c17e55d0 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -2761,7 +2761,6 @@ def warmup_graphs(self,

     def _warmup_multimodal_graph(self,
                                  kv_caches,
-                                 available_mem,
                                  starting_mem=0,
                                  total_batch_seq=0.001):
@@ -2775,11 +2774,6 @@ def _warmup_multimodal_graph(self,
            _, max_seq_len = self.bucketing_ctx.get_max_prompt_shape()
            seq_len = max_seq_len
            batch_seq = 1 * num_patches
-           # Graph memory usage is proportional to seq dimension in a batch
-           mem_estimate = batch_seq / total_batch_seq * total_mem
-           if mem_estimate >= available_mem:
-               captured_all = False
-               continue
            graphed_multimodal_bucket = num_patches
            if graphed_multimodal_bucket in self.graphed_multimodal_buckets:
                continue
@@ -2797,7 +2791,6 @@ def _warmup_multimodal_graph(self,
                used_mem = align_workers(mem_prof.consumed_device_memory,
                                         torch.distributed.ReduceOp.MAX)
-               available_mem -= used_mem
                total_mem += used_mem
                total_batch_seq += batch_seq
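
For context, the hunks above delete a memory-budget heuristic: the warmup loop estimated a bucket's graph memory as proportional to its batch_seq, using the per-unit cost observed on previously captured buckets, and skipped capture when the estimate exceeded the remaining budget. Below is a minimal, self-contained sketch of that removed heuristic; the names mirror the diff, but the function wrapper and the example numbers are hypothetical, not part of the patch.

    # Sketch of the proportionality check removed by this diff (hypothetical
    # wrapper; variable names follow the deleted lines).
    def should_skip_capture(batch_seq: float,
                            total_batch_seq: float,
                            total_mem: float,
                            available_mem: float) -> bool:
        """Assume graph memory grows proportionally to batch_seq.

        total_mem / total_batch_seq is the observed memory per unit of
        batch_seq across already-captured buckets; scaling it by the new
        bucket's batch_seq gives the estimate the removed code compared
        against available_mem before capturing.
        """
        mem_estimate = batch_seq / total_batch_seq * total_mem
        return mem_estimate >= available_mem

    # Example: 3 GiB consumed over total_batch_seq=6.0; a new bucket with
    # batch_seq=4.0 is estimated at 2 GiB and skipped if under 2 GiB remains.
    print(should_skip_capture(4.0, 6.0, 3.0, 1.5))  # True: 2.0 >= 1.5

After the patch, every multimodal bucket is captured unconditionally and the loop only accumulates total_mem and total_batch_seq from the profiler, so the available_mem parameter and the early-exit bookkeeping become dead and are removed.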