
Commit 99625ee

[NPUW] Fix empty output if the app set output tensor before infer (#31202)
* The issue occurs when the user tries to set a `RemoteTensor` as an output for the NPUW infer request (`JustInferRequest`).
* In the `set_tensor()` function in `BaseInferRequest`, we just put the tensor into `m_port_to_tensor` so it can later be returned from `get_tensor()`.
* However, NPUW infer requests pre-allocate I/O on the NPU. `JustInferRequest` stores that memory in `m_funcall_result` and puts the results there after inference.
* This fix implements `set_tensor()` for `JustInferRequest`, so the user-provided output overrides the pre-allocated one in `m_funcall_result`.
* Also removed `m_input_tensors` and `m_output_tensors`, as they aren't used anywhere and would just keep pre-allocated memory alive.
* The issue described above only occurs with a `RemoteTensor` allocated on the NPU. It is not clear why; this needs further investigation.
1 parent 18f1571 commit 99625ee
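For context, below is a minimal application-side sketch of the scenario this commit fixes: the output tensor is allocated on the NPU remote context and handed to the request before inference. The model path, shapes, and the "NPU_USE_NPUW" property value are assumptions for illustration; only the core OpenVINO 2.0 C++ API calls (compile_model, get_default_context, create_tensor, set_tensor) are taken as given.

    #include <openvino/openvino.hpp>

    int main() {
        ov::Core core;

        // "model.xml" is a placeholder path; the NPUW-enabling property is an
        // assumption here and may differ between OpenVINO releases.
        auto compiled = core.compile_model("model.xml", "NPU",
                                           ov::AnyMap{{"NPU_USE_NPUW", "YES"}});
        auto request = compiled.create_infer_request();

        // Allocate the output on the NPU remote context and hand it to the
        // request *before* running inference -- the case this commit fixes.
        auto context = core.get_default_context("NPU");
        auto output_port = compiled.output(0);
        ov::RemoteTensor remote_out =
            context.create_tensor(output_port.get_element_type(), output_port.get_shape());
        request.set_tensor(output_port, remote_out);

        request.infer();
        // Before the fix, JustInferRequest kept writing the result into its
        // pre-allocated tensor in m_funcall_result, so remote_out stayed empty.
        return 0;
    }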

File tree

4 files changed: +29 -9 lines changed


src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp

Lines changed: 1 addition & 3 deletions
@@ -259,9 +259,8 @@ void ov::npuw::IBaseInferRequest::alloc_io() {
    for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
        const auto& port = m_npuw_model->inputs()[i];
        ov::SoPtr<ov::ITensor> allocated = allocOut(port, m_npuw_model->global_mem_device());
-       m_input_tensors.push_back(allocated);
        m_input_allocated.insert(allocated->data());
-       m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true};
+       m_port_to_tensor[port] = TensorStorage{allocated, true};
    }  // for(inputs)

    // Preallocate output tensors
@@ -276,7 +275,6 @@ void ov::npuw::IBaseInferRequest::alloc_io() {
        LOG_INFO("Produced by Subgraph[" << from_submodel.first << "] / " << from_submodel.second);

        auto tensor = alloc_global_out(i);
-       m_output_tensors.push_back(tensor);
        m_port_to_tensor[port] = TensorStorage{tensor, true};
    }
}
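The removed `m_input_tensors` / `m_output_tensors` push_backs above only duplicated ownership: `ov::SoPtr` is reference-counted, so the `TensorStorage` entry in `m_port_to_tensor` is enough to keep the allocation alive. A simplified, self-contained sketch of that ownership model (plain std::shared_ptr instead of the actual OpenVINO types, illustrative names only):

    #include <map>
    #include <memory>
    #include <vector>

    // Simplified stand-ins for ov::SoPtr<ov::ITensor> and the NPUW TensorStorage;
    // illustration only, not the real OpenVINO types.
    using TensorPtr = std::shared_ptr<std::vector<float>>;

    struct TensorStorage {
        TensorPtr tensor;
        bool persistent = false;
    };

    int main() {
        std::map<int, TensorStorage> port_to_tensor;

        // Allocate an "I/O tensor" and store it only in the port map.
        // No side vector is needed: the map entry alone keeps the buffer alive.
        TensorPtr allocated = std::make_shared<std::vector<float>>(16, 0.f);
        port_to_tensor[0] = TensorStorage{allocated, true};

        // Even after the local handle is dropped, the allocation survives
        // because TensorStorage holds a strong (reference-counted) reference.
        allocated.reset();
        return port_to_tensor.at(0).tensor ? 0 : 1;
    }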

src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp

Lines changed: 0 additions & 6 deletions
@@ -87,12 +87,6 @@ class IBaseInferRequest : public ov::ISyncInferRequest {
    // so this cached information is used to detect these situations.
    std::vector<std::string> m_subrequest_devices;

-   // Permanent storage for input & output tensors
-   // FIXME: Currently is initialized in subclasses. Likely this
-   // initialization should be moved here, to the base class?
-   std::vector<ov::SoPtr<ov::ITensor>> m_input_tensors;
-   std::vector<ov::SoPtr<ov::ITensor>> m_output_tensors;
-
    struct TensorStorage {
        ov::SoPtr<ov::ITensor> tensor;
        bool persistent = false;  // true for the parent I/O tensors
src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp

Lines changed: 26 additions & 0 deletions
@@ -353,6 +353,32 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
    }
}

+void ov::npuw::JustInferRequest::set_tensor(const ov::Output<const ov::Node>& port,
+                                            const ov::SoPtr<ov::ITensor>& tensor) {
+    // Check that it's I/O
+    NPUW_ASSERT(m_port_to_tensor.at(port).persistent);
+
+    // Assigning via .at() to ensure it is a known port
+    m_port_to_tensor.at(port).tensor = tensor;
+
+    // Check if setting output tensor
+    for (std::size_t i = 0; i < m_npuw_model->outputs().size(); ++i) {
+        if (m_npuw_model->outputs()[i] == port) {
+            const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(i);
+            auto funcall_result_iter = m_funcall_result.find(from_submodel);
+            // This is a tricky case:
+            // 1) We already allocated an output tensor in m_funcall_result via FMM
+            // 2) We got an output tensor from outside
+            // m_funcall_result and m_port_to_tensor aren't connected, thus we will only write
+            // to m_funcall_result, but get_tensor() would return an empty tensor from m_port_to_tensor.
+            // Here we have to set the tensor to function's output, so the function will write to the correct tensor.
+            if (funcall_result_iter != m_funcall_result.end()) {
+                funcall_result_iter->second = tensor;
+            }
+        }
+    }
+}
+
ov::npuw::TensorPtr ov::npuw::JustInferRequest::alloc_global_out(std::size_t out_idx) {
    const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(out_idx);
    auto funcall_result_iter = m_funcall_result.find(from_submodel);
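To make the "tricky case" comment above concrete, here is a simplified, self-contained model of the two bookkeeping structures, using plain standard-library types with illustrative names rather than the real NPUW classes. Inference writes through the m_funcall_result entry while get_tensor() serves from m_port_to_tensor, so a user-provided output has to be propagated to both, which is what the new set_tensor() does:

    #include <cassert>
    #include <map>
    #include <memory>
    #include <utility>
    #include <vector>

    // Simplified stand-ins for the NPUW bookkeeping; illustrative names,
    // not the real OpenVINO/NPUW classes.
    using Tensor = std::shared_ptr<std::vector<float>>;
    using Port = int;                         // global model output index
    using SubmodelOut = std::pair<int, int>;  // {subgraph index, result index}

    struct Request {
        std::map<Port, Tensor> port_to_tensor;         // what get_tensor() returns
        std::map<SubmodelOut, Tensor> funcall_result;  // where inference writes

        // Mirrors the fix: a user-provided output replaces the pre-allocated
        // tensor in *both* maps; otherwise results land in the old buffer and
        // the user's tensor comes back empty.
        void set_tensor(Port port, SubmodelOut from, Tensor t) {
            port_to_tensor.at(port) = t;
            auto it = funcall_result.find(from);
            if (it != funcall_result.end()) {
                it->second = t;
            }
        }

        void infer(SubmodelOut from) {
            // The function call writes its result into funcall_result's tensor.
            funcall_result.at(from)->assign({1.f, 2.f, 3.f});
        }
    };

    int main() {
        Request rq;
        const Port port = 0;
        const SubmodelOut from{0, 0};
        rq.port_to_tensor[port] = std::make_shared<std::vector<float>>();  // pre-allocated
        rq.funcall_result[from] = rq.port_to_tensor[port];

        // The application sets its own output tensor before inference.
        auto user_out = std::make_shared<std::vector<float>>();
        rq.set_tensor(port, from, user_out);

        rq.infer(from);
        assert(!rq.port_to_tensor.at(port)->empty());  // the user's tensor now holds the result
        return 0;
    }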

src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp

Lines changed: 2 additions & 0 deletions
@@ -85,6 +85,8 @@ class JustInferRequest final : public IBaseInferRequest {

    TensorPtr alloc_global_out(std::size_t out_idx) override;

+   void set_tensor(const ov::Output<const ov::Node>& port, const ov::SoPtr<ov::ITensor>& tensor) override;
+
    ////////////////////////////////////
    // now own API

