
Commit effaf3a

[GPU] Reset memory from padded_pool (#30728)
### Details:
- *Sample case*
  1. Node A {64..128, 64..128} with padding {1,1} is saved in _padded_pool under the key layout shape {128,128}.
  2. Node B has the same shape and padding as node A, so node B's output reuses node A's output from _padded_pool.
  3. At execution time, node B's shape is updated to {64,64} with pad {1,1}, and node B's output is reinterpreted and reset.
  4. Node C {128,128} with pad {1,1} reuses A/B's output in _padded_pool, but node B's real output layout is {64,64} with pad {1,1}. Before executing node C, node B's output should be reset.

### Tickets:
- *165710*
- *165708*
- *151125*
1 parent ef64b0b commit effaf3a
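
The failure mode is easy to reproduce outside the plugin. Below is a minimal stand-alone sketch (simplified, hypothetical semantics; not the plugin's real pool code): a pooled buffer sized for the padded {128,128} layout is fully written by A, then reused by B whose effective shape shrank to {64,64}; the bytes B does not cover keep their old values, so C, which reinterprets the buffer as {128,128}, reads stale data unless the buffer is reset first.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<float> pooled(128 * 128, 7.f);  // node A wrote the whole padded buffer

        // Node B reuses the allocation but only writes a 64x64 region.
        for (int y = 0; y < 64; ++y)
            for (int x = 0; x < 64; ++x)
                pooled[y * 128 + x] = 1.f;

        // Node C reinterprets the same buffer as {128,128} with padding.
        std::printf("outside B's region: %g\n", pooled[100 * 128 + 100]);  // 7: stale

        // The fix: reset (fill) the pooled buffer before C's kernel reads it.
        std::fill(pooled.begin(), pooled.end(), 0.f);
        std::printf("after reset: %g\n", pooled[100 * 128 + 100]);         // 0
        return 0;
    }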

File tree

5 files changed: +128, -6 lines


src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp

Lines changed: 1 addition & 1 deletion

@@ -127,7 +127,7 @@ struct memory {
 #endif
 
     std::shared_ptr<MemoryTracker> get_mem_tracker() const { return m_mem_tracker; }
-    GPU_DEBUG_CODE(bool from_memory_pool = false);
+    bool from_memory_pool = false;
 
 protected:
     engine* _engine;

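The flag is promoted from a debug-only member to an unconditional one because the reset logic added in primitive_inst.cpp (next file) reads it in every build configuration. A sketch of the usual debug-only macro pattern this moves away from (assumed expansion; the real definition lives in the plugin's debug-configuration header):

    #ifdef GPU_DEBUG_CONFIG
    #define GPU_DEBUG_CODE(...) __VA_ARGS__   // kept in debug-enabled builds
    #else
    #define GPU_DEBUG_CODE(...)               // compiled out otherwise
    #endif

    struct memory_before {
        GPU_DEBUG_CODE(bool from_memory_pool = false;)  // member vanishes in release builds
    };
    struct memory_after {
        bool from_memory_pool = false;                  // always present, safe for runtime logic
    };
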
src/plugins/intel_gpu/src/graph/primitive_inst.cpp

Lines changed: 6 additions & 0 deletions

@@ -1013,6 +1013,12 @@ void primitive_inst::realloc_if_needed(bool prev_execution_skipped) {
         GPU_DEBUG_CODE(memalloc_info += (((_outputs.size() > 1) ? ("o" + to_string(i) + ":") : "") +
                        (_outputs[i]->from_memory_pool ? "from_pool" : "new_alloc"));)
         GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO(memalloc_info);
+
+        if (need_reset_output_memory() && !can_be_optimized() &&
+            _outputs[i]->from_memory_pool && _outputs[i]->get_layout().data_padding) {
+            GPU_DEBUG_TRACE_DETAIL << id() << " : Need reset output memory considering user" << std::endl;
+            add_dep_event(_outputs[i]->fill(get_network().get_stream()));
+        }
     }
 }
 

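The reset is issued as an asynchronous fill whose completion event is chained into the primitive's dependencies, so the kernel cannot start reading the buffer before the stale contents are cleared. A toy model of that ordering (hypothetical types; the real code uses cldnn streams and events):

    #include <algorithm>
    #include <future>
    #include <vector>

    struct toy_memory {
        std::vector<float> data;
        // Asynchronous zero-fill; the returned future plays the role of an event.
        std::future<void> fill() {
            return std::async(std::launch::async, [this] {
                std::fill(data.begin(), data.end(), 0.f);
            });
        }
    };

    int main() {
        toy_memory out{std::vector<float>(128 * 128, 7.f)};
        bool from_pool = true, has_padding = true;
        if (from_pool && has_padding) {
            auto ev = out.fill();  // add_dep_event(...) in the real code
            ev.wait();             // the consuming kernel waits on this event
        }
        // ... kernel writing into `out` runs here ...
        return 0;
    }
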
src/plugins/intel_gpu/src/runtime/memory_pool.cpp

Lines changed: 2 additions & 2 deletions

@@ -171,7 +171,7 @@ memory::ptr memory_pool::get_from_non_padded_pool(const layout& layout,
                 !has_conflict(it->second._users, restrictions))) {
             it->second._users.insert(memory_user(MEM_USER(unique_id, network_id, prim_id, layout_bytes_count)));
             auto ret_mem = _engine->reinterpret_buffer(*it->second._memory, layout);
-            GPU_DEBUG_CODE(ret_mem->from_memory_pool = true);
+            ret_mem->from_memory_pool = true;
             return ret_mem;
         } else {
             ++it;

@@ -217,7 +217,7 @@ memory::ptr memory_pool::get_from_padded_pool(const layout& layout,
             !has_conflict(rec_list._users, restrictions)) {
             auto ret_mem = _engine->reinterpret_buffer(*(rec_list._memory), layout);
             rec_list._users.insert({MEM_USER(unique_id, network_id, prim_id, ret_mem->size())});
-            GPU_DEBUG_CODE(ret_mem->from_memory_pool = true);
+            ret_mem->from_memory_pool = true;
             return ret_mem;
         }
     }

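Both pool paths now tag what they hand out unconditionally. A reduced model (hypothetical types, simplified keying) of why the tag matters: reused pool memory may hold another primitive's data, while a fresh allocation has no previous pool user, so only the former needs the reset treatment:

    #include <cstddef>
    #include <map>
    #include <memory>
    #include <vector>

    struct mem {
        std::vector<float> data;
        bool from_memory_pool = false;
    };

    std::map<std::size_t, std::shared_ptr<mem>> pool;  // keyed by element count here

    std::shared_ptr<mem> get_memory(std::size_t count) {
        if (auto it = pool.find(count); it != pool.end()) {
            it->second->from_memory_pool = true;  // reused: caller may need to reset it
            return it->second;
        }
        auto m = std::make_shared<mem>();
        m->data.resize(count);                    // fresh allocation: no previous pool user
        pool.emplace(count, m);
        return m;
    }
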
src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp

Lines changed: 7 additions & 3 deletions

@@ -212,23 +212,27 @@ memory::ptr ocl_engine::reinterpret_buffer(const memory& memory, const layout& new_layout) {
                     memory.get_layout().format.to_string(), " Target: ", new_layout.format.to_string());
 
     try {
+        bool from_memory_pool = memory.from_memory_pool;
+        memory::ptr reinterpret_memory = nullptr;
         if (new_layout.format.is_image_2d()) {
-            return std::make_shared<ocl::gpu_image2d>(this,
+            reinterpret_memory = std::make_shared<ocl::gpu_image2d>(this,
                                                       new_layout,
                                                       reinterpret_cast<const ocl::gpu_image2d&>(memory).get_buffer(),
                                                       memory.get_mem_tracker());
         } else if (memory_capabilities::is_usm_type(memory.get_allocation_type())) {
-            return std::make_shared<ocl::gpu_usm>(this,
+            reinterpret_memory = std::make_shared<ocl::gpu_usm>(this,
                                                   new_layout,
                                                   reinterpret_cast<const ocl::gpu_usm&>(memory).get_buffer(),
                                                   memory.get_allocation_type(),
                                                   memory.get_mem_tracker());
         } else {
-            return std::make_shared<ocl::gpu_buffer>(this,
+            reinterpret_memory = std::make_shared<ocl::gpu_buffer>(this,
                                                      new_layout,
                                                      reinterpret_cast<const ocl::gpu_buffer&>(memory).get_buffer(),
                                                      memory.get_mem_tracker());
         }
+        reinterpret_memory->from_memory_pool = from_memory_pool;
+        return reinterpret_memory;
     } catch (cl::Error const& err) {
         OPENVINO_THROW(OCL_ERR_MSG_FMT(err));
     }

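reinterpret_buffer wraps the same underlying allocation in a new memory object, so members set on the source do not carry over by themselves; the change captures the flag up front and copies it onto whichever wrapper type gets constructed. A reduced illustration (hypothetical types):

    #include <memory>
    #include <vector>

    struct buffer { std::vector<float> storage; };  // shared underlying allocation

    struct memory_wrapper {
        std::shared_ptr<buffer> buf;
        int layout_tag = 0;             // stand-in for the layout
        bool from_memory_pool = false;  // does not copy itself across wrappers
    };

    memory_wrapper reinterpret(const memory_wrapper& m, int new_layout_tag) {
        memory_wrapper out{m.buf, new_layout_tag};
        out.from_memory_pool = m.from_memory_pool;  // must be forwarded explicitly
        return out;
    }
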
src/plugins/intel_gpu/tests/unit/dynamic_execution/memory_realloc_test.cpp

Lines changed: 112 additions & 0 deletions

@@ -4,6 +4,7 @@
 
 #include "test_utils.h"
 
+#include <intel_gpu/primitives/concatenation.hpp>
 #include <intel_gpu/primitives/input_layout.hpp>
 #include <intel_gpu/primitives/softmax.hpp>
 #include <intel_gpu/primitives/reorder.hpp>

@@ -128,6 +129,117 @@ TEST(memory_reuse_realloc_reset_test, basic_conv_with_padding) {
            << reorder_mem->get_mem_tracker()->size() << "bytes.";
 }
 
+TEST(memory_reuse_realloc_reset_test, basic_conv_with_memory_get_from_padded_pool) {
+    auto& engine = get_test_engine();
+
+    layout weight_layout = layout{ov::PartialShape{1, 4, 3, 3}, data_types::f32, format::bfyx};
+    auto weights = engine.allocate_memory(weight_layout);
+    set_values<float>(weights, {
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f
+    });
+
+    layout weight_layout2 = layout{ov::PartialShape{1, 3, 3, 3}, data_types::f32, format::bfyx};
+    auto weights2 = engine.allocate_memory(weight_layout2);
+    set_values<float>(weights2, {
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f
+    });
+
+    layout elt_layout1 = layout{ov::PartialShape{1, 2, 4, 4}, data_types::f32, format::bfyx};
+    auto elt_mem1 = engine.allocate_memory(elt_layout1);
+    set_values<float>(elt_mem1, {
+        10.f, 10.f, 10.f, 10.f,
+        10.f, 10.f, 10.f, 10.f,
+        10.f, 10.f, 10.f, 10.f,
+        10.f, 10.f, 10.f, 10.f,
+
+        10.f, 10.f, 10.f, 10.f,
+        10.f, 10.f, 10.f, 10.f,
+        10.f, 10.f, 10.f, 10.f,
+        10.f, 10.f, 10.f, 10.f
+    });
+
+    std::vector<float> ref_output = {
+        1080, 1720, 1720, 1080,
+        1720, 2740, 2740, 1720,
+        1720, 2740, 2740, 1720,
+        1080, 1720, 1720, 1080
+    };
+
+    std::vector<float> subtract_val = {0.f, };
+    auto input_l = layout{ov::PartialShape::dynamic(4), data_types::f32, format::bfyx};
+    auto elt_input_l = layout{ov::PartialShape::dynamic(4), data_types::f32, format::bfyx};
+
+    topology topology(input_layout("elt_input", elt_input_l),
+                      data("weights", weights),
+                      data("weights2", weights2),
+                      reorder("reorder1-1", input_info("elt_input"), format::bfyx, data_types::f32, subtract_val, reorder_mean_mode::subtract),
+                      reorder("reorder1-2", input_info("elt_input"), format::bfyx, data_types::f32, subtract_val, reorder_mean_mode::subtract),
+                      concatenation("concat1", {input_info("reorder1-1"), input_info("reorder1-2")}, 1),
+                      convolution("conv1",
+                                  input_info("concat1"),
+                                  "weights",
+                                  "", /*bias*/
+                                  1,
+                                  {1, 1}, /*stride*/
+                                  {1, 1}, /*dilation*/
+                                  {1, 1}, /*pad_above*/
+                                  {1, 1}, /*pad_below*/
+                                  false,
+                                  ov::op::PadType::EXPLICIT),
+                      reorder("reorder2-1", input_info("conv1"), format::bfyx, data_types::f32, subtract_val, reorder_mean_mode::subtract),
+                      concatenation("concat2", {input_info("reorder1-1"), input_info("reorder2-1")}, 1),
+                      convolution("conv2",
+                                  input_info("concat2"),
+                                  "weights2",
+                                  "", /*bias*/
+                                  1,
+                                  {1, 1}, /*stride*/
+                                  {1, 1}, /*dilation*/
+                                  {1, 1}, /*pad_above*/
+                                  {1, 1}, /*pad_below*/
+                                  false,
+                                  ov::op::PadType::EXPLICIT),
+                      reorder("output", input_info("conv2"), format::bfyx, data_types::f32));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    config.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
+
+    network network(engine, topology, config);
+    network.set_input_data("elt_input", elt_mem1);
+    auto outputs = network.execute();
+    auto output_mem = outputs.begin()->second.get_memory();
+    cldnn::mem_lock<float> output_mem_ptr(output_mem, get_test_stream());
+
+    for (size_t i = 0; i < output_mem->get_layout().get_linear_size(); ++i) {
+        ASSERT_EQ(output_mem_ptr[i], ref_output[i]);
+    }
+}
+
 TEST(softmax_gpu_dynamic_f32_test_upper_bound, input_same_values) {
     static const int32_t
         output_x_1 = 10, output_b_1 = 8,

0 commit comments
