
Commit 99625ee

[NPUW] Fix empty output if the app set output tensor before infer (#31202)
* The issue occurs when the user tries to set a `RemoteTensor` as an output for the NPUW infer request (`JustInferRequest`).
* In the `set_tensor()` function in `BaseInferRequest`, we just put the tensor into `m_port_to_tensor` so it can later be returned from `get_tensor()`.
* However, NPUW infer requests pre-allocate I/O on the NPU. `JustInferRequest` stores that memory in `m_funcall_result` and puts the results there after inference.
* This fix implements `set_tensor()` for `JustInferRequest`, so the user-provided output overrides the pre-allocated one in `m_funcall_result`.
* Also removed `m_input_tensors` and `m_output_tensors`, as they aren't used anywhere and would just keep pre-allocated memory alive.
* The issue described above only occurs with a `RemoteTensor` allocated on the NPU. It is not clear why; this needs further investigation.
1 parent 18f1571 commit 99625ee
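For context, below is a minimal application-side sketch of the scenario this commit fixes: the output tensor is allocated on the NPU remote context and handed to the request before inference. The model path, shapes, and the "NPU_USE_NPUW" property value are assumptions for illustration; only the core OpenVINO 2.0 C++ API calls (compile_model, get_default_context, create_tensor, set_tensor) are taken as given.

    #include <openvino/openvino.hpp>

    int main() {
        ov::Core core;

        // "model.xml" is a placeholder path; the NPUW-enabling property is an
        // assumption here and may differ between OpenVINO releases.
        auto compiled = core.compile_model("model.xml", "NPU",
                                           ov::AnyMap{{"NPU_USE_NPUW", "YES"}});
        auto request = compiled.create_infer_request();

        // Allocate the output on the NPU remote context and hand it to the
        // request *before* running inference -- the case this commit fixes.
        auto context = core.get_default_context("NPU");
        auto output_port = compiled.output(0);
        ov::RemoteTensor remote_out =
            context.create_tensor(output_port.get_element_type(), output_port.get_shape());
        request.set_tensor(output_port, remote_out);

        request.infer();
        // Before the fix, JustInferRequest kept writing the result into its
        // pre-allocated tensor in m_funcall_result, so remote_out stayed empty.
        return 0;
    }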

File tree

4 files changed: +29 -9 lines changed


src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp

Lines changed: 1 addition & 3 deletions
@@ -259,9 +259,8 @@ void ov::npuw::IBaseInferRequest::alloc_io() {
    for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
        const auto& port = m_npuw_model->inputs()[i];
        ov::SoPtr<ov::ITensor> allocated = allocOut(port, m_npuw_model->global_mem_device());
-       m_input_tensors.push_back(allocated);
        m_input_allocated.insert(allocated->data());
-       m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true};
+       m_port_to_tensor[port] = TensorStorage{allocated, true};
    }  // for(inputs)

    // Preallocate output tensors
@@ -276,7 +275,6 @@ void ov::npuw::IBaseInferRequest::alloc_io() {
        LOG_INFO("Produced by Subgraph[" << from_submodel.first << "] / " << from_submodel.second);

        auto tensor = alloc_global_out(i);
-       m_output_tensors.push_back(tensor);
        m_port_to_tensor[port] = TensorStorage{tensor, true};
    }
}
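The removed `m_input_tensors` / `m_output_tensors` push_backs above only duplicated ownership: `ov::SoPtr` is reference-counted, so the `TensorStorage` entry in `m_port_to_tensor` is enough to keep the allocation alive. A simplified, self-contained sketch of that ownership model (plain std::shared_ptr instead of the actual OpenVINO types, illustrative names only):

    #include <map>
    #include <memory>
    #include <vector>

    // Simplified stand-ins for ov::SoPtr<ov::ITensor> and the NPUW TensorStorage;
    // illustration only, not the real OpenVINO types.
    using TensorPtr = std::shared_ptr<std::vector<float>>;

    struct TensorStorage {
        TensorPtr tensor;
        bool persistent = false;
    };

    int main() {
        std::map<int, TensorStorage> port_to_tensor;

        // Allocate an "I/O tensor" and store it only in the port map.
        // No side vector is needed: the map entry alone keeps the buffer alive.
        TensorPtr allocated = std::make_shared<std::vector<float>>(16, 0.f);
        port_to_tensor[0] = TensorStorage{allocated, true};

        // Even after the local handle is dropped, the allocation survives
        // because TensorStorage holds a strong (reference-counted) reference.
        allocated.reset();
        return port_to_tensor.at(0).tensor ? 0 : 1;
    }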

src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp

Lines changed: 0 additions & 6 deletions
@@ -87,12 +87,6 @@ class IBaseInferRequest : public ov::ISyncInferRequest {
    // so this cached information is used to detect these situations.
    std::vector<std::string> m_subrequest_devices;

-   // Permanent storage for input & output tensors
-   // FIXME: Currently is initialized in subclasses. Likely this
-   // initialization should be moved here, to the base class?
-   std::vector<ov::SoPtr<ov::ITensor>> m_input_tensors;
-   std::vector<ov::SoPtr<ov::ITensor>> m_output_tensors;
-
    struct TensorStorage {
        ov::SoPtr<ov::ITensor> tensor;
        bool persistent = false;  // true for the parent I/O tensors
src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp

Lines changed: 26 additions & 0 deletions
@@ -353,6 +353,32 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
    }
}

+void ov::npuw::JustInferRequest::set_tensor(const ov::Output<const ov::Node>& port,
+                                            const ov::SoPtr<ov::ITensor>& tensor) {
+    // Check that it's I/O
+    NPUW_ASSERT(m_port_to_tensor.at(port).persistent);
+
+    // Assigning via .at() to ensure it is a known port
+    m_port_to_tensor.at(port).tensor = tensor;
+
+    // Check if setting output tensor
+    for (std::size_t i = 0; i < m_npuw_model->outputs().size(); ++i) {
+        if (m_npuw_model->outputs()[i] == port) {
+            const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(i);
+            auto funcall_result_iter = m_funcall_result.find(from_submodel);
+            // This is a tricky case:
+            // 1) We already allocated an output tensor in m_funcall_result via FMM
+            // 2) We got an output tensor from outside
+            // m_funcall_result and m_port_to_tensor aren't connected, thus we will only write
+            // to m_funcall_result, but get_tensor() would return an empty tensor from m_port_to_tensor.
+            // Here we have to set the tensor to function's output, so the function will write to the correct tensor.
+            if (funcall_result_iter != m_funcall_result.end()) {
+                funcall_result_iter->second = tensor;
+            }
+        }
+    }
+}
+
ov::npuw::TensorPtr ov::npuw::JustInferRequest::alloc_global_out(std::size_t out_idx) {
    const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(out_idx);
    auto funcall_result_iter = m_funcall_result.find(from_submodel);
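To make the "tricky case" comment above concrete, here is a simplified, self-contained model of the two bookkeeping structures, using plain standard-library types with illustrative names rather than the real NPUW classes. Inference writes through the m_funcall_result entry while get_tensor() serves from m_port_to_tensor, so a user-provided output has to be propagated to both, which is what the new set_tensor() does:

    #include <cassert>
    #include <map>
    #include <memory>
    #include <utility>
    #include <vector>

    // Simplified stand-ins for the NPUW bookkeeping; illustrative names,
    // not the real OpenVINO/NPUW classes.
    using Tensor = std::shared_ptr<std::vector<float>>;
    using Port = int;                         // global model output index
    using SubmodelOut = std::pair<int, int>;  // {subgraph index, result index}

    struct Request {
        std::map<Port, Tensor> port_to_tensor;         // what get_tensor() returns
        std::map<SubmodelOut, Tensor> funcall_result;  // where inference writes

        // Mirrors the fix: a user-provided output replaces the pre-allocated
        // tensor in *both* maps; otherwise results land in the old buffer and
        // the user's tensor comes back empty.
        void set_tensor(Port port, SubmodelOut from, Tensor t) {
            port_to_tensor.at(port) = t;
            auto it = funcall_result.find(from);
            if (it != funcall_result.end()) {
                it->second = t;
            }
        }

        void infer(SubmodelOut from) {
            // The function call writes its result into funcall_result's tensor.
            funcall_result.at(from)->assign({1.f, 2.f, 3.f});
        }
    };

    int main() {
        Request rq;
        const Port port = 0;
        const SubmodelOut from{0, 0};
        rq.port_to_tensor[port] = std::make_shared<std::vector<float>>();  // pre-allocated
        rq.funcall_result[from] = rq.port_to_tensor[port];

        // The application sets its own output tensor before inference.
        auto user_out = std::make_shared<std::vector<float>>();
        rq.set_tensor(port, from, user_out);

        rq.infer(from);
        assert(!rq.port_to_tensor.at(port)->empty());  // the user's tensor now holds the result
        return 0;
    }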

src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp

Lines changed: 2 additions & 0 deletions
@@ -85,6 +85,8 @@ class JustInferRequest final : public IBaseInferRequest {

    TensorPtr alloc_global_out(std::size_t out_idx) override;

+   void set_tensor(const ov::Output<const ov::Node>& port, const ov::SoPtr<ov::ITensor>& tensor) override;
+
    ////////////////////////////////////
    // now own API

