diff --git a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py
index 67681dff8f6..e09ee954f2e 100644
--- a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py
@@ -330,9 +330,9 @@ def recv_kv_caches_and_hidden_states_hpu(
             block_indices_tensor = torch.tensor(block_indices_list[start_block_idx:end_block_idx], device="hpu", dtype=torch.int32 )
             # we think this is a padding sequence, so we skip it. but we still need write kv cache
             if slen == 1:
-                for i in range(model_executable.model.model.start_layer,
-                               model_executable.model.model.end_layer):
-                    current_layer_idx = i - model_executable.model.model.start_layer
+                for i in range(model_executable.model.start_layer,
+                               model_executable.model.end_layer):
+                    current_layer_idx = i - model_executable.model.start_layer
                     kv_cache = kv_caches[current_layer_idx]
                     # key_cache, value_cache = kv_cache[0], kv_cache[1]
                     key_cache = kv_cache[0]
@@ -371,8 +371,9 @@ def recv_kv_caches_and_hidden_states_hpu(
             num_computed_tokens_list.append(num_computed_tokens)
             # it's padded to block size now.
-            key_values = remote_kv.to("hpu")
-            keys = key_values
+            # key_values = remote_kv.to("hpu")
+            # TEST: use CPU kv cache directly
+            keys = remote_kv
             # values = key_values[..., self.k_head_size:]
             htorch.core.mark_step()
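
For context, the sketch below illustrates the per-layer indexing pattern this diff adjusts: layer indices run from the model's start_layer to end_layer (as with pipeline-parallel sharding), while kv_caches is a rank-local list indexed from zero, and the received remote KV tensor is used directly on CPU rather than being copied with .to("hpu") first. It is a minimal, hypothetical example, not vLLM's actual connector API; DummyModel, write_remote_kv, and the tensor shapes are invented for illustration.

# Minimal sketch (assumed names, not vLLM's API) of the layer-offset indexing
# and direct-CPU KV copy that the diff above changes.
from dataclasses import dataclass

import torch


@dataclass
class DummyModel:
    start_layer: int  # first layer owned by this rank
    end_layer: int    # one past the last layer owned by this rank


def write_remote_kv(model: DummyModel,
                    kv_caches: list[torch.Tensor],
                    remote_kv: torch.Tensor) -> None:
    """Copy a received (CPU) KV tensor into each local layer's key cache."""
    for i in range(model.start_layer, model.end_layer):
        # Global layer index i is offset into the rank-local kv_caches list.
        current_layer_idx = i - model.start_layer
        key_cache = kv_caches[current_layer_idx][0]
        # Use the CPU tensor directly (mirroring the "TEST" change above);
        # an HPU path would instead move it first with remote_kv.to("hpu").
        key_cache.copy_(remote_kv[current_layer_idx])


# Usage: 2 local layers, each cache shaped [2 (k/v), blocks, block_size, head_dim]
model = DummyModel(start_layer=4, end_layer=6)
kv_caches = [torch.zeros(2, 8, 16, 64) for _ in range(2)]
remote_kv = torch.randn(2, 8, 16, 64)
write_remote_kv(model, kv_caches, remote_kv)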