diff --git a/src/plugins/intel_gpu/src/graph/include/program_helpers.h b/src/plugins/intel_gpu/src/graph/include/program_helpers.h
index 5c47d535343494..e23d16988df485 100644
--- a/src/plugins/intel_gpu/src/graph/include/program_helpers.h
+++ b/src/plugins/intel_gpu/src/graph/include/program_helpers.h
@@ -132,7 +132,6 @@ struct onednn_add_fusing_helpers {
     static void for_eltwise(const program_node& conv_node, eltwise_mode mode,
                             std::function<void(const program_node&, const fused_primitive_desc&)> func);
     static add_fusing_type get_add_fusing_type(const program_node& node, const fused_primitive_desc& desc);
-    static int32_t get_reused_eltwmem_idx(const program_node& node);
 };
 
 using add_fusing_type = onednn_add_fusing_helpers::add_fusing_type;
diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp
index e5198595492e77..d385f96d5cb76b 100644
--- a/src/plugins/intel_gpu/src/graph/network.cpp
+++ b/src/plugins/intel_gpu/src/graph/network.cpp
@@ -545,6 +545,31 @@ void network::allocate_primitives() {
 
     auto& po = _program->get_processing_order();
 
+    for (auto const& node : po) {
+        if (node->get_preferred_impl_type() == impl_types::onednn) {
+            size_t eltw_dep = 0;
+            for (auto& fused_op : node->get_fused_primitives()) {
+                if (fused_op.is_type<eltwise>() && fused_op.deps.size() == 1) {
+                    // If it is first sum, reuse the buffer
+                    auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op);
+                    if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
+                        continue;
+                    if (!fused_op.has_outer_dep())
+                        continue;
+                    eltw_dep = fused_op.outer_dep_start_idx;
+                    auto& eltw_in = node->get_dependency(eltw_dep);
+                    if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
+                        auto& eltw_inst = _primitives.at(eltw_in.id());
+                        auto& prim_inst = _primitives.at(node->id());
+                        auto& eltw_mem = eltw_inst->output_memory();
+                        auto new_mem = eltw_mem.get_engine()->reinterpret_buffer(eltw_mem, node->get_output_layout());
+                        prim_inst->set_output_memory(new_mem);
+                    }
+                }
+            }
+        }
+    }
+
     // Update the output memory address of optimized-out layer if it is not valid.
     for (auto const& node : po) {
         if (node->can_be_optimized() && !node->is_dynamic() &&
diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
index 2d73ee69cdba3a..f3ea728be9c493 100644
--- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
+++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -2206,25 +2206,16 @@ primitive_inst::primitive_inst(network & network, program_node const& node, bool
             }
         }
 
-        if (auto reused_eltwmem_idx = onednn_add_fusing_helpers::get_reused_eltwmem_idx(node); reused_eltwmem_idx != -1) {
-            // sum post-op can use the input buffer as the output buffer
-            auto& eltw_node = node.get_dependency(reused_eltwmem_idx);
-            const auto& eltw_inst = _network.get_primitive(eltw_node.id());
-            auto& eltw_mem = eltw_inst->output_memory();
-            auto new_mem = eltw_mem.get_engine()->reinterpret_buffer(eltw_mem, node.get_output_layout());
-            _outputs.push_back(new_mem);
+        // TODO: Remove WA for arg_max_min node.
+        // For now it's required to handle the case when only second output of TopK primitive is used in plugin,
+        // but kernels always write both outputs to the same memory object which leads to wrong result.
+        if (user_count == 1 && mutable_data_count == 1 && !node.is_type<arg_max_min>()
+            && !node.is_type<experimental_detectron_roi_feature_extractor>()) {
+            for (auto& user : node.get_users())
+                if (user->is_type<mutable_data>())
+                    _outputs[0] = user->as<mutable_data>().get_attached_memory_ptr();
         } else {
-            // TODO: Remove WA for arg_max_min node.
-            // For now it's required to handle the case when only second output of TopK primitive is used in plugin,
-            // but kernels always write both outputs to the same memory object which leads to wrong result.
-            if (user_count == 1 && mutable_data_count == 1 && !node.is_type<arg_max_min>() &&
-                !node.is_type<experimental_detectron_roi_feature_extractor>()) {
-                for (auto& user : node.get_users())
-                    if (user->is_type<mutable_data>())
-                        _outputs[0] = user->as<mutable_data>().get_attached_memory_ptr();
-            } else {
-                _outputs = allocate_outputs();
-            }
+            _outputs = allocate_outputs();
         }
     }
     if (_node) {
diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp
index ad4a290fc28470..56f0c569ba590c 100644
--- a/src/plugins/intel_gpu/src/graph/program.cpp
+++ b/src/plugins/intel_gpu/src/graph/program.cpp
@@ -742,10 +742,6 @@ const std::vector<primitive_id>& program::get_allocating_order(bool forced_updat
         if (lhs_layout.is_dynamic())
             return false;
 
-        if (lhs_layout.bytes_count() == rhs_layout.bytes_count()) {
-            return lhs->get_unique_id() < rhs->get_unique_id();
-        }
-
         return (lhs_layout.bytes_count() > rhs_layout.bytes_count());
     });
 
diff --git a/src/plugins/intel_gpu/src/graph/program_helpers.cpp b/src/plugins/intel_gpu/src/graph/program_helpers.cpp
index a326c9bce6c725..0c1c6fdcc7112a 100644
--- a/src/plugins/intel_gpu/src/graph/program_helpers.cpp
+++ b/src/plugins/intel_gpu/src/graph/program_helpers.cpp
@@ -158,21 +158,4 @@ add_fusing_type onednn_add_fusing_helpers::get_add_fusing_type(
 
     return add_fusing_type::binary_per_oc;
 }
-
-int32_t onednn_add_fusing_helpers::get_reused_eltwmem_idx(const program_node& node) {
-    if (node.get_preferred_impl_type() == impl_types::onednn) {
-        for (auto& fused_op : node.get_fused_primitives()) {
-            if (fused_op.is_type<eltwise>() && fused_op.deps.size() == 1) {
-                // If it is first sum, reuse the buffer
-                auto fusing_type = get_add_fusing_type(node, fused_op);
-                if (fusing_type != add_fusing_type::sum)
-                    continue;
-                if (!fused_op.has_outer_dep())
-                    continue;
-                return fused_op.outer_dep_start_idx;
-            }
-        }
-    }
-    return -1;  // if -1, no reused input memory
-}
 }  // namespace cldnn
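
Note on the relocated logic: per the diff, the buffer-reuse decision now runs in `network::allocate_primitives()` after all primitive instances exist, picking the first fused eltwise `sum` post-op with a single dependency and reinterpreting that dependency's output buffer as the node's output so the oneDNN sum post-op can accumulate in place. The sketch below only illustrates that selection rule; `fused_desc` and `find_reusable_eltwise_input` are simplified stand-ins for illustration, not the cldnn API.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical, simplified stand-ins for the cldnn types involved; only the
// fields consulted by the buffer-reuse decision are modeled here.
enum class add_fusing_type { sum, binary_per_oc, not_supported };

struct fused_desc {
    bool is_eltwise = false;            // stands in for fused_op.is_type<eltwise>()
    std::size_t dep_count = 0;          // stands in for fused_op.deps.size()
    bool has_outer_dep = false;         // stands in for fused_op.has_outer_dep()
    int32_t outer_dep_start_idx = -1;   // dependency index of the eltwise input
    add_fusing_type fusing_type = add_fusing_type::not_supported;
};

// Returns the dependency index whose buffer may be reused as the node's output,
// or -1 if no fused sum post-op qualifies. Mirrors the rule in the diff:
// only the first fused eltwise with a single dependency and fusing_type == sum.
int32_t find_reusable_eltwise_input(const std::vector<fused_desc>& fused_ops) {
    for (const auto& fused_op : fused_ops) {
        if (!fused_op.is_eltwise || fused_op.dep_count != 1)
            continue;
        if (fused_op.fusing_type != add_fusing_type::sum)
            continue;
        if (!fused_op.has_outer_dep)
            continue;
        return fused_op.outer_dep_start_idx;  // first qualifying sum wins
    }
    return -1;
}

int main() {
    std::vector<fused_desc> ops = {
        {true, 1, true, 2, add_fusing_type::sum},  // eligible: reuse dependency #2
        {true, 1, true, 3, add_fusing_type::sum},  // ignored: only the first sum is reused
    };
    return find_reusable_eltwise_input(ops) == 2 ? 0 : 1;
}
```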