
Commit 3932807

[GPU] Avoid memory allocation for any node that can reuse previous memory (openvinotoolkit#30957)
### Details:
There is a strategy to reuse memory for some intermediate outputs. However, it was applied only after memory had already been allocated for all intermediate outputs, so the peak memory usage was not actually reduced. This PR addresses that problem and reduces the memory footprint of some models, e.g. a 35% memory reduction for the SD1.5 vae_decoder model when generating 512×512 images.

### Tickets:
- [CVS-169229](https://jira.devtools.intel.com/browse/CVS-169229)
1 parent 8231e61 commit 3932807
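
For intuition, here is a minimal standalone sketch (not OpenVINO code; the buffers and values are made up) of the reuse pattern this PR exploits: when a node's fused eltwise is a sum post-op, the node can accumulate its result directly into the summand's buffer instead of allocating a separate output buffer.

```cpp
// Toy illustration of in-place sum post-op reuse; plain std::vector stands in
// for GPU memory objects.
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> residual = {1.f, 2.f, 3.f, 4.f};      // output of the eltwise dependency
    std::vector<float> conv_out = {10.f, 20.f, 30.f, 40.f};  // hypothetical conv result

    // Without reuse: a third buffer would be allocated for conv_out + residual.
    // With reuse: accumulate into `residual` in place, as a sum post-op does.
    for (size_t i = 0; i < residual.size(); ++i)
        residual[i] += conv_out[i];

    for (float v : residual)
        std::printf("%g ", v);  // 11 22 33 44
    std::printf("\n");
    return 0;
}
```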

5 files changed: +41 −35 lines changed

src/plugins/intel_gpu/src/graph/include/program_helpers.h

Lines changed: 1 addition & 0 deletions

```diff
@@ -132,6 +132,7 @@ struct onednn_add_fusing_helpers {
     static void for_eltwise(const program_node& conv_node, eltwise_mode mode,
                             std::function<void(const program_node&, const fused_primitive_desc&)> func);
     static add_fusing_type get_add_fusing_type(const program_node& node, const fused_primitive_desc& desc);
+    static int32_t get_reused_eltwmem_idx(const program_node& node);
 };
 
 using add_fusing_type = onednn_add_fusing_helpers::add_fusing_type;
```

src/plugins/intel_gpu/src/graph/network.cpp

Lines changed: 0 additions & 25 deletions

```diff
@@ -545,31 +545,6 @@ void network::allocate_primitives() {
 
     auto& po = _program->get_processing_order();
 
-    for (auto const& node : po) {
-        if (node->get_preferred_impl_type() == impl_types::onednn) {
-            size_t eltw_dep = 0;
-            for (auto& fused_op : node->get_fused_primitives()) {
-                if (fused_op.is_type<eltwise>() && fused_op.deps.size() == 1) {
-                    // If it is first sum, reuse the buffer
-                    auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op);
-                    if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
-                        continue;
-                    if (!fused_op.has_outer_dep())
-                        continue;
-                    eltw_dep = fused_op.outer_dep_start_idx;
-                    auto& eltw_in = node->get_dependency(eltw_dep);
-                    if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
-                        auto& eltw_inst = _primitives.at(eltw_in.id());
-                        auto& prim_inst = _primitives.at(node->id());
-                        auto& eltw_mem = eltw_inst->output_memory();
-                        auto new_mem = eltw_mem.get_engine()->reinterpret_buffer(eltw_mem, node->get_output_layout());
-                        prim_inst->set_output_memory(new_mem);
-                    }
-                }
-            }
-        }
-    }
     // Update the output memory address of optimized-out layer if it is not valid.
     for (auto const& node : po) {
         if (node->can_be_optimized() && !node->is_dynamic() &&
```

src/plugins/intel_gpu/src/graph/primitive_inst.cpp

Lines changed: 18 additions & 9 deletions

```diff
@@ -2205,16 +2205,25 @@ primitive_inst::primitive_inst(network & network, program_node const& node, bool
             }
         }
 
-        // TODO: Remove WA for arg_max_min node.
-        // For now it's required to handle the case when only second output of TopK primitive is used in plugin,
-        // but kernels always write both outputs to the same memory object which leads to wrong result.
-        if (user_count == 1 && mutable_data_count == 1 && !node.is_type<arg_max_min>()
-            && !node.is_type<experimental_detectron_roi_feature_extractor>()) {
-            for (auto& user : node.get_users())
-                if (user->is_type<mutable_data>())
-                    _outputs[0] = user->as<mutable_data>().get_attached_memory_ptr();
+        if (auto reused_eltwmem_idx = onednn_add_fusing_helpers::get_reused_eltwmem_idx(node); reused_eltwmem_idx != -1) {
+            // sum post-op can use the input buffer as the output buffer
+            auto& eltw_node = node.get_dependency(reused_eltwmem_idx);
+            const auto& eltw_inst = _network.get_primitive(eltw_node.id());
+            auto& eltw_mem = eltw_inst->output_memory();
+            auto new_mem = eltw_mem.get_engine()->reinterpret_buffer(eltw_mem, node.get_output_layout());
+            _outputs.push_back(new_mem);
         } else {
-            _outputs = allocate_outputs();
+            // TODO: Remove WA for arg_max_min node.
+            // For now it's required to handle the case when only second output of TopK primitive is used in plugin,
+            // but kernels always write both outputs to the same memory object which leads to wrong result.
+            if (user_count == 1 && mutable_data_count == 1 && !node.is_type<arg_max_min>() &&
+                !node.is_type<experimental_detectron_roi_feature_extractor>()) {
+                for (auto& user : node.get_users())
+                    if (user->is_type<mutable_data>())
+                        _outputs[0] = user->as<mutable_data>().get_attached_memory_ptr();
+            } else {
+                _outputs = allocate_outputs();
+            }
         }
     }
     if (_node) {
```
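
Taken together, the two hunks above relocate the reuse decision from network::allocate_primitives (where it ran only after every output had already been allocated) into the primitive_inst constructor (where the allocation itself happens). A toy sketch, with made-up buffer sizes, of why only the second placement lowers the peak:

```cpp
// Simulates peak memory under the old flow (allocate, then alias) vs. the new
// flow (reuse instead of allocating). Sizes are hypothetical.
#include <algorithm>
#include <cstdio>

int main() {
    const long buf = 100;  // pretend each intermediate buffer is 100 MB
    long cur = 0, peak = 0;
    auto alloc = [&](long n) { cur += n; peak = std::max(peak, cur); };

    // Old flow: allocate both outputs, then redirect one to alias the other.
    alloc(buf);  // eltwise dependency's output
    alloc(buf);  // node's own output, counted at peak even though aliased later
    cur -= buf;  // the redundant buffer is released after the fact
    std::printf("old peak: %ld MB\n", peak);  // 200

    // New flow: the second allocation is skipped entirely.
    cur = peak = 0;
    alloc(buf);  // eltwise dependency's output, reused as the node's output
    std::printf("new peak: %ld MB\n", peak);  // 100
    return 0;
}
```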

src/plugins/intel_gpu/src/graph/program.cpp

Lines changed: 4 additions & 0 deletions

```diff
@@ -736,6 +736,10 @@ const std::vector<primitive_id>& program::get_allocating_order(bool forced_updat
         if (lhs_layout.is_dynamic())
             return false;
 
+        if (lhs_layout.bytes_count() == rhs_layout.bytes_count()) {
+            return lhs->get_unique_id() < rhs->get_unique_id();
+        }
+
         return (lhs_layout.bytes_count() > rhs_layout.bytes_count());
     });
 
```
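
The added equal-size branch makes the allocating order deterministic: with only the `>` comparison, nodes whose layouts have equal byte counts were left in an unspecified relative order. A hedged sketch (plain structs standing in for program_node) of the same comparator shape:

```cpp
// Sort by size descending, tie-breaking on a unique id so that equal-sized
// entries always land in the same order across runs.
#include <algorithm>
#include <cstdio>
#include <vector>

struct Node { int unique_id; long bytes; };  // stand-ins for program_node fields

int main() {
    std::vector<Node> nodes = {{3, 64}, {1, 128}, {2, 64}};
    std::sort(nodes.begin(), nodes.end(), [](const Node& lhs, const Node& rhs) {
        if (lhs.bytes == rhs.bytes)
            return lhs.unique_id < rhs.unique_id;  // deterministic tie-break
        return lhs.bytes > rhs.bytes;              // largest allocations first
    });
    for (const auto& n : nodes)
        std::printf("id=%d bytes=%ld\n", n.unique_id, n.bytes);  // ids 1, 2, 3
    return 0;
}
```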

src/plugins/intel_gpu/src/graph/program_helpers.cpp

Lines changed: 18 additions & 1 deletion

```diff
@@ -159,5 +159,22 @@ add_fusing_type onednn_add_fusing_helpers::get_add_fusing_type(
     return add_fusing_type::binary_per_oc;
 }
 
-
+int32_t onednn_add_fusing_helpers::get_reused_eltwmem_idx(const program_node& node) {
+    int32_t reused_mem_idx = -1;  // if -1, no reused input memory
+    if (node.get_preferred_impl_type() == impl_types::onednn) {
+        size_t eltw_dep = 0;
+        for (auto& fused_op : node.get_fused_primitives()) {
+            if (fused_op.is_type<eltwise>() && fused_op.deps.size() == 1) {
+                // If it is first sum, reuse the buffer
+                auto fusing_type = get_add_fusing_type(node, fused_op);
+                if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
+                    continue;
+                if (!fused_op.has_outer_dep())
+                    continue;
+                reused_mem_idx = fused_op.outer_dep_start_idx;
+            }
+        }
+    }
+    return reused_mem_idx;
+}
 }  // namespace cldnn
```
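
The helper returns -1 as a "no reusable input" sentinel, which the primitive_inst.cpp hunk consumes through a C++17 if-with-initializer. A self-contained sketch of that calling pattern, with a hypothetical stub standing in for get_reused_eltwmem_idx:

```cpp
#include <cstdint>
#include <cstdio>

// Stub with the same contract as get_reused_eltwmem_idx: the index of a
// dependency whose buffer can be reused, or -1 when there is none.
int32_t get_reused_idx_stub(bool has_fused_sum) {
    return has_fused_sum ? 1 : -1;
}

int main() {
    // C++17 if-with-initializer: the index is scoped to the if/else, so it
    // cannot be used accidentally after the reuse decision has been made.
    if (auto reused_idx = get_reused_idx_stub(true); reused_idx != -1) {
        std::printf("reuse dependency %d's buffer as the output\n", reused_idx);
    } else {
        std::printf("fall back to allocating a fresh output\n");
    }
    return 0;
}
```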
