
Commit effaf3a

[GPU] Reset memory from padded_pool (#30728)
### Details:
- *Sample case*
  1. Node A {64..128, 64..128} with padding {1,1} is saved in _padded_pool under the key layout shape {128,128}.
  2. Node B has the same shape and padding as node A, so node B's output reuses node A's output from _padded_pool.
  3. At execution time, node B's shape is updated to {64,64} with pad {1,1}, and node B's output is reinterpreted and reset.
  4. Node C {128,128} with pad {1,1} reuses A/B's output in _padded_pool, but node B's real output layout is {64,64} with pad {1,1}. Before executing node C, node B's output should be reset.

### Tickets:
- *165710*
- *165708*
- *151125*
1 parent ef64b0b commit effaf3a
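
The failure mode is easy to reproduce outside the plugin. Below is a minimal stand-alone sketch (simplified, hypothetical semantics; not the plugin's real pool code): a pooled buffer sized for the padded {128,128} layout is fully written by A, then reused by B whose effective shape shrank to {64,64}; the bytes B does not cover keep their old values, so C, which reinterprets the buffer as {128,128}, reads stale data unless the buffer is reset first.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<float> pooled(128 * 128, 7.f);  // node A wrote the whole padded buffer

        // Node B reuses the allocation but only writes a 64x64 region.
        for (int y = 0; y < 64; ++y)
            for (int x = 0; x < 64; ++x)
                pooled[y * 128 + x] = 1.f;

        // Node C reinterprets the same buffer as {128,128} with padding.
        std::printf("outside B's region: %g\n", pooled[100 * 128 + 100]);  // 7: stale

        // The fix: reset (fill) the pooled buffer before C's kernel reads it.
        std::fill(pooled.begin(), pooled.end(), 0.f);
        std::printf("after reset: %g\n", pooled[100 * 128 + 100]);         // 0
        return 0;
    }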

File tree

5 files changed: +128, -6 lines


src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp

Lines changed: 1 addition & 1 deletion

@@ -127,7 +127,7 @@ struct memory {
 #endif
 
     std::shared_ptr<MemoryTracker> get_mem_tracker() const { return m_mem_tracker; }
-    GPU_DEBUG_CODE(bool from_memory_pool = false);
+    bool from_memory_pool = false;
 
 protected:
     engine* _engine;

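The flag is promoted from a debug-only member to an unconditional one because the reset logic added in primitive_inst.cpp (next file) reads it in every build configuration. A sketch of the usual debug-only macro pattern this moves away from (assumed expansion; the real definition lives in the plugin's debug-configuration header):

    #ifdef GPU_DEBUG_CONFIG
    #define GPU_DEBUG_CODE(...) __VA_ARGS__   // kept in debug-enabled builds
    #else
    #define GPU_DEBUG_CODE(...)               // compiled out otherwise
    #endif

    struct memory_before {
        GPU_DEBUG_CODE(bool from_memory_pool = false;)  // member vanishes in release builds
    };
    struct memory_after {
        bool from_memory_pool = false;                  // always present, safe for runtime logic
    };
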
src/plugins/intel_gpu/src/graph/primitive_inst.cpp

Lines changed: 6 additions & 0 deletions

@@ -1013,6 +1013,12 @@ void primitive_inst::realloc_if_needed(bool prev_execution_skipped) {
         GPU_DEBUG_CODE(memalloc_info += (((_outputs.size() > 1) ? ("o" + to_string(i) + ":") : "") +
                        (_outputs[i]->from_memory_pool ? "from_pool" : "new_alloc"));)
         GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO(memalloc_info);
+
+        if (need_reset_output_memory() && !can_be_optimized() &&
+            _outputs[i]->from_memory_pool && _outputs[i]->get_layout().data_padding) {
+            GPU_DEBUG_TRACE_DETAIL << id() << " : Need reset output memory considering user" << std::endl;
+            add_dep_event(_outputs[i]->fill(get_network().get_stream()));
+        }
     }
 }
 

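The reset is issued as an asynchronous fill whose completion event is chained into the primitive's dependencies, so the kernel cannot start reading the buffer before the stale contents are cleared. A toy model of that ordering (hypothetical types; the real code uses cldnn streams and events):

    #include <algorithm>
    #include <future>
    #include <vector>

    struct toy_memory {
        std::vector<float> data;
        // Asynchronous zero-fill; the returned future plays the role of an event.
        std::future<void> fill() {
            return std::async(std::launch::async, [this] {
                std::fill(data.begin(), data.end(), 0.f);
            });
        }
    };

    int main() {
        toy_memory out{std::vector<float>(128 * 128, 7.f)};
        bool from_pool = true, has_padding = true;
        if (from_pool && has_padding) {
            auto ev = out.fill();  // add_dep_event(...) in the real code
            ev.wait();             // the consuming kernel waits on this event
        }
        // ... kernel writing into `out` runs here ...
        return 0;
    }
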
src/plugins/intel_gpu/src/runtime/memory_pool.cpp

Lines changed: 2 additions & 2 deletions

@@ -171,7 +171,7 @@ memory::ptr memory_pool::get_from_non_padded_pool(const layout& layout,
                 !has_conflict(it->second._users, restrictions))) {
             it->second._users.insert(memory_user(MEM_USER(unique_id, network_id, prim_id, layout_bytes_count)));
             auto ret_mem = _engine->reinterpret_buffer(*it->second._memory, layout);
-            GPU_DEBUG_CODE(ret_mem->from_memory_pool = true);
+            ret_mem->from_memory_pool = true;
             return ret_mem;
         } else {
             ++it;

@@ -217,7 +217,7 @@ memory::ptr memory_pool::get_from_padded_pool(const layout& layout,
             !has_conflict(rec_list._users, restrictions)) {
             auto ret_mem = _engine->reinterpret_buffer(*(rec_list._memory), layout);
             rec_list._users.insert({MEM_USER(unique_id, network_id, prim_id, ret_mem->size())});
-            GPU_DEBUG_CODE(ret_mem->from_memory_pool = true);
+            ret_mem->from_memory_pool = true;
             return ret_mem;
         }
     }

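Both pool paths now tag what they hand out unconditionally. A reduced model (hypothetical types, simplified keying) of why the tag matters: reused pool memory may hold another primitive's data, while a fresh allocation has no previous pool user, so only the former needs the reset treatment:

    #include <cstddef>
    #include <map>
    #include <memory>
    #include <vector>

    struct mem {
        std::vector<float> data;
        bool from_memory_pool = false;
    };

    std::map<std::size_t, std::shared_ptr<mem>> pool;  // keyed by element count here

    std::shared_ptr<mem> get_memory(std::size_t count) {
        if (auto it = pool.find(count); it != pool.end()) {
            it->second->from_memory_pool = true;  // reused: caller may need to reset it
            return it->second;
        }
        auto m = std::make_shared<mem>();
        m->data.resize(count);                    // fresh allocation: no previous pool user
        pool.emplace(count, m);
        return m;
    }
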
src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp

Lines changed: 7 additions & 3 deletions

@@ -212,23 +212,27 @@ memory::ptr ocl_engine::reinterpret_buffer(const memory& memory, const layout& new_layout) {
                     memory.get_layout().format.to_string(), " Target: ", new_layout.format.to_string());
 
     try {
+        bool from_memory_pool = memory.from_memory_pool;
+        memory::ptr reinterpret_memory = nullptr;
         if (new_layout.format.is_image_2d()) {
-            return std::make_shared<ocl::gpu_image2d>(this,
+            reinterpret_memory = std::make_shared<ocl::gpu_image2d>(this,
                                                       new_layout,
                                                       reinterpret_cast<const ocl::gpu_image2d&>(memory).get_buffer(),
                                                       memory.get_mem_tracker());
         } else if (memory_capabilities::is_usm_type(memory.get_allocation_type())) {
-            return std::make_shared<ocl::gpu_usm>(this,
+            reinterpret_memory = std::make_shared<ocl::gpu_usm>(this,
                                                   new_layout,
                                                   reinterpret_cast<const ocl::gpu_usm&>(memory).get_buffer(),
                                                   memory.get_allocation_type(),
                                                   memory.get_mem_tracker());
         } else {
-            return std::make_shared<ocl::gpu_buffer>(this,
+            reinterpret_memory = std::make_shared<ocl::gpu_buffer>(this,
                                                      new_layout,
                                                      reinterpret_cast<const ocl::gpu_buffer&>(memory).get_buffer(),
                                                      memory.get_mem_tracker());
         }
+        reinterpret_memory->from_memory_pool = from_memory_pool;
+        return reinterpret_memory;
     } catch (cl::Error const& err) {
         OPENVINO_THROW(OCL_ERR_MSG_FMT(err));
     }

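reinterpret_buffer wraps the same underlying allocation in a new memory object, so members set on the source do not carry over by themselves; the change captures the flag up front and copies it onto whichever wrapper type gets constructed. A reduced illustration (hypothetical types):

    #include <memory>
    #include <vector>

    struct buffer { std::vector<float> storage; };  // shared underlying allocation

    struct memory_wrapper {
        std::shared_ptr<buffer> buf;
        int layout_tag = 0;             // stand-in for the layout
        bool from_memory_pool = false;  // does not copy itself across wrappers
    };

    memory_wrapper reinterpret(const memory_wrapper& m, int new_layout_tag) {
        memory_wrapper out{m.buf, new_layout_tag};
        out.from_memory_pool = m.from_memory_pool;  // must be forwarded explicitly
        return out;
    }
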
src/plugins/intel_gpu/tests/unit/dynamic_execution/memory_realloc_test.cpp

Lines changed: 112 additions & 0 deletions

@@ -4,6 +4,7 @@
 
 #include "test_utils.h"
 
+#include <intel_gpu/primitives/concatenation.hpp>
 #include <intel_gpu/primitives/input_layout.hpp>
 #include <intel_gpu/primitives/softmax.hpp>
 #include <intel_gpu/primitives/reorder.hpp>

@@ -128,6 +129,117 @@ TEST(memory_reuse_realloc_reset_test, basic_conv_with_padding) {
            << reorder_mem->get_mem_tracker()->size() << "bytes.";
 }
 
+TEST(memory_reuse_realloc_reset_test, basic_conv_with_memory_get_from_padded_pool) {
+    auto& engine = get_test_engine();
+
+    layout weight_layout = layout{ov::PartialShape{1, 4, 3, 3}, data_types::f32, format::bfyx};
+    auto weights = engine.allocate_memory(weight_layout);
+    set_values<float>(weights, {
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f
+    });
+
+    layout weight_layout2 = layout{ov::PartialShape{1, 3, 3, 3}, data_types::f32, format::bfyx};
+    auto weights2 = engine.allocate_memory(weight_layout2);
+    set_values<float>(weights2, {
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f
+    });
+
+    layout elt_layout1 = layout{ov::PartialShape{1, 2, 4, 4}, data_types::f32, format::bfyx};
+    auto elt_mem1 = engine.allocate_memory(elt_layout1);
+    set_values<float>(elt_mem1, {
+        10.f, 10.f, 10.f, 10.f,
+        10.f, 10.f, 10.f, 10.f,
+        10.f, 10.f, 10.f, 10.f,
+        10.f, 10.f, 10.f, 10.f,
+
+        10.f, 10.f, 10.f, 10.f,
+        10.f, 10.f, 10.f, 10.f,
+        10.f, 10.f, 10.f, 10.f,
+        10.f, 10.f, 10.f, 10.f
+    });
+
+    std::vector<float> ref_output = {
+        1080, 1720, 1720, 1080,
+        1720, 2740, 2740, 1720,
+        1720, 2740, 2740, 1720,
+        1080, 1720, 1720, 1080
+    };
+
+    std::vector<float> subtract_val = {0.f, };
+    auto input_l = layout{ov::PartialShape::dynamic(4), data_types::f32, format::bfyx};
+    auto elt_input_l = layout{ov::PartialShape::dynamic(4), data_types::f32, format::bfyx};
+
+    topology topology(input_layout("elt_input", elt_input_l),
+                      data("weights", weights),
+                      data("weights2", weights2),
+                      reorder("reorder1-1", input_info("elt_input"), format::bfyx, data_types::f32, subtract_val, reorder_mean_mode::subtract),
+                      reorder("reorder1-2", input_info("elt_input"), format::bfyx, data_types::f32, subtract_val, reorder_mean_mode::subtract),
+                      concatenation("concat1", {input_info("reorder1-1"), input_info("reorder1-2")}, 1),
+                      convolution("conv1",
+                                  input_info("concat1"),
+                                  "weights",
+                                  "", /*bias*/
+                                  1,
+                                  {1, 1}, /*stride*/
+                                  {1, 1}, /*dilation*/
+                                  {1, 1}, /*pad_above*/
+                                  {1, 1}, /*pad_below*/
+                                  false,
+                                  ov::op::PadType::EXPLICIT),
+                      reorder("reorder2-1", input_info("conv1"), format::bfyx, data_types::f32, subtract_val, reorder_mean_mode::subtract),
+                      concatenation("concat2", {input_info("reorder1-1"), input_info("reorder2-1")}, 1),
+                      convolution("conv2",
+                                  input_info("concat2"),
+                                  "weights2",
+                                  "", /*bias*/
+                                  1,
+                                  {1, 1}, /*stride*/
+                                  {1, 1}, /*dilation*/
+                                  {1, 1}, /*pad_above*/
+                                  {1, 1}, /*pad_below*/
+                                  false,
+                                  ov::op::PadType::EXPLICIT),
+                      reorder("output", input_info("conv2"), format::bfyx, data_types::f32));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    config.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
+
+    network network(engine, topology, config);
+    network.set_input_data("elt_input", elt_mem1);
+    auto outputs = network.execute();
+    auto output_mem = outputs.begin()->second.get_memory();
+    cldnn::mem_lock<float> output_mem_ptr(output_mem, get_test_stream());
+
+    for (size_t i = 0; i < output_mem->get_layout().get_linear_size(); ++i) {
+        ASSERT_EQ(output_mem_ptr[i], ref_output[i]);
+    }
+}
+
 TEST(softmax_gpu_dynamic_f32_test_upper_bound, input_same_values) {
     static const int32_t
         output_x_1 = 10, output_b_1 = 8,

0 commit comments
