diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index b022832effc..0e97e93a3e0 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -5,6 +5,7 @@
 from typing import (Callable, Dict, Iterable, List, Literal, Mapping, Optional,
                     Protocol, Set, Tuple, Union, overload)
 
+import habana_frameworks.torch.core as htcore
 import torch
 import torch.nn as nn
 from torch.func import functional_call
@@ -391,6 +392,7 @@ def _merge_multimodal_embeddings(
     """
     # skip check for HPU, the number of tokens is a cpu fallback during HPU lazy
     if current_platform.is_hpu():
+        htcore.mark_step()
         flattened = _flatten_embeddings(multimodal_embeddings)
         inputs_embeds[is_multimodal] = flattened
         return inputs_embeds
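
For context, a minimal sketch of the lazy-mode pattern this hunk relies on (not part of the patch; the device, shapes, and tensor names are illustrative). Under HPU lazy execution, ops on device tensors are queued rather than run immediately, and `htcore.mark_step()` flushes the queued graph to the device. Issuing it right before the boolean-mask assignment means the data-dependent masking step, which the in-diff comment notes falls back to the CPU under lazy mode, only has to synchronize its own work rather than pull the entire accumulated graph through the fallback path.

```python
# Minimal sketch, not part of the patch: shows where mark_step() sits
# relative to a boolean-mask assignment under HPU lazy mode. Requires a
# Gaudi device with habana_frameworks installed; shapes are made up.
import habana_frameworks.torch.core as htcore
import torch

inputs_embeds = torch.zeros(8, 16, device="hpu")   # queued, not yet executed
is_multimodal = torch.tensor([True, False] * 4, device="hpu")
flattened = torch.ones(4, 16, device="hpu")        # one row per True in mask

# Flush everything queued so far into one device graph. Without this, the
# CPU fallback triggered by the data-dependent mask below would also have
# to materialize all of the pending work above.
htcore.mark_step()

inputs_embeds[is_multimodal] = flattened           # data-dependent scatter
```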