@@ -132,7 +132,6 @@ struct onednn_add_fusing_helpers {
     static void for_eltwise(const program_node& conv_node, eltwise_mode mode,
                             std::function<void(const program_node&, const fused_primitive_desc&)> func);
     static add_fusing_type get_add_fusing_type(const program_node& node, const fused_primitive_desc& desc);
-    static int32_t get_reused_eltwmem_idx(const program_node& node);
 };
 
 using add_fusing_type = onednn_add_fusing_helpers::add_fusing_type;
src/plugins/intel_gpu/src/graph/network.cpp (25 additions, 0 deletions)

@@ -545,6 +545,31 @@ void network::allocate_primitives() {
 
     auto& po = _program->get_processing_order();
 
+    for (auto const& node : po) {
+        if (node->get_preferred_impl_type() == impl_types::onednn) {
+            size_t eltw_dep = 0;
+            for (auto& fused_op : node->get_fused_primitives()) {
+                if (fused_op.is_type<eltwise>() && fused_op.deps.size() == 1) {
+                    // If it is first sum, reuse the buffer
+                    auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op);
+                    if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
+                        continue;
+                    if (!fused_op.has_outer_dep())
+                        continue;
+                    eltw_dep = fused_op.outer_dep_start_idx;
+                    auto& eltw_in = node->get_dependency(eltw_dep);
+                    if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
+                        auto& eltw_inst = _primitives.at(eltw_in.id());
+                        auto& prim_inst = _primitives.at(node->id());
+                        auto& eltw_mem = eltw_inst->output_memory();
+                        auto new_mem = eltw_mem.get_engine()->reinterpret_buffer(eltw_mem, node->get_output_layout());
+                        prim_inst->set_output_memory(new_mem);
+                    }
+                }
+            }
+        }
+    }
+
     // Update the output memory address of optimized-out layer if it is not valid.
     for (auto const& node : po) {
         if (node->can_be_optimized() && !node->is_dynamic() &&
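Note: the loop added above decides, per oneDNN-preferred node, which fused eltwise input may donate its buffer to the node's output. Below is a minimal standalone sketch of that selection rule; the FusedOp struct, its field names, and reused_eltwise_dep are hypothetical stand-ins for cldnn::fused_primitive_desc and its accessors, not OpenVINO API.

```cpp
#include <cstddef>
#include <optional>
#include <vector>

// Hypothetical stand-in for cldnn::fused_primitive_desc; field names are
// illustrative only.
struct FusedOp {
    bool is_eltwise = false;       // mirrors fused_op.is_type<eltwise>()
    std::size_t dep_count = 0;     // mirrors fused_op.deps.size()
    bool is_sum = false;           // mirrors get_add_fusing_type(...) == add_fusing_type::sum
    int outer_dep_start_idx = -1;  // -1 when there is no outer dependency
};

// Returns the dependency index whose memory the node's output may alias:
// the first fused eltwise that is a sum post-op with exactly one dependency
// and a valid outer dependency, matching the loop in allocate_primitives().
std::optional<std::size_t> reused_eltwise_dep(const std::vector<FusedOp>& fused_ops) {
    for (const auto& op : fused_ops) {
        if (!op.is_eltwise || op.dep_count != 1)
            continue;
        if (!op.is_sum || op.outer_dep_start_idx < 0)
            continue;
        return static_cast<std::size_t>(op.outer_dep_start_idx);  // first qualifying sum wins
    }
    return std::nullopt;  // no input buffer can be reused
}
```

Once an index is found, the added loop rebinds the output only when both primitive instances already exist in _primitives, aliasing the eltwise input's memory via reinterpret_buffer with the node's output layout.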
src/plugins/intel_gpu/src/graph/primitive_inst.cpp (9 additions, 18 deletions)

@@ -2206,25 +2206,16 @@ primitive_inst::primitive_inst(network & network, program_node const& node, bool
             }
         }
 
-        if (auto reused_eltwmem_idx = onednn_add_fusing_helpers::get_reused_eltwmem_idx(node); reused_eltwmem_idx != -1) {
-            // sum post-op can use the input buffer as the output buffer
-            auto& eltw_node = node.get_dependency(reused_eltwmem_idx);
-            const auto& eltw_inst = _network.get_primitive(eltw_node.id());
-            auto& eltw_mem = eltw_inst->output_memory();
-            auto new_mem = eltw_mem.get_engine()->reinterpret_buffer(eltw_mem, node.get_output_layout());
-            _outputs.push_back(new_mem);
+        // TODO: Remove WA for arg_max_min node.
+        // For now it's required to handle the case when only second output of TopK primitive is used in plugin,
+        // but kernels always write both outputs to the same memory object which leads to wrong result.
+        if (user_count == 1 && mutable_data_count == 1 && !node.is_type<arg_max_min>()
+            && !node.is_type<experimental_detectron_roi_feature_extractor>()) {
+            for (auto& user : node.get_users())
+                if (user->is_type<mutable_data>())
+                    _outputs[0] = user->as<mutable_data>().get_attached_memory_ptr();
         } else {
-            // TODO: Remove WA for arg_max_min node.
-            // For now it's required to handle the case when only second output of TopK primitive is used in plugin,
-            // but kernels always write both outputs to the same memory object which leads to wrong result.
-            if (user_count == 1 && mutable_data_count == 1 && !node.is_type<arg_max_min>() &&
-                !node.is_type<experimental_detectron_roi_feature_extractor>()) {
-                for (auto& user : node.get_users())
-                    if (user->is_type<mutable_data>())
-                        _outputs[0] = user->as<mutable_data>().get_attached_memory_ptr();
-            } else {
-                _outputs = allocate_outputs();
-            }
+            _outputs = allocate_outputs();
         }
     }
     if (_node) {
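With this change the constructor no longer special-cases the sum post-op: outputs are always allocated (or attached to a mutable_data user) here, and network::allocate_primitives() may later rebind them. A toy sketch of that allocate-then-rebind pattern follows; Buffer and Primitive are hypothetical types, not OpenVINO's.

```cpp
#include <cstddef>
#include <iostream>
#include <memory>

// Hypothetical minimal types; cldnn's memory and primitive_inst are far richer.
struct Buffer {
    std::size_t bytes;
};

struct Primitive {
    std::shared_ptr<Buffer> output;
};

int main() {
    // Phase 1 (constructor): every primitive gets its own default output.
    Primitive eltw_in{std::make_shared<Buffer>(Buffer{1024})};
    Primitive conv{std::make_shared<Buffer>(Buffer{1024})};

    // Phase 2 (allocate_primitives): a sum post-op may write in place, so
    // rebind conv's output to alias the eltwise input's buffer.
    conv.output = eltw_in.output;

    std::cout << "aliased: " << (conv.output == eltw_in.output) << "\n";
    return 0;
}
```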
src/plugins/intel_gpu/src/graph/program.cpp (0 additions, 4 deletions)

@@ -742,10 +742,6 @@ const std::vector<primitive_id>& program::get_allocating_order(bool forced_updat
         if (lhs_layout.is_dynamic())
             return false;
 
-        if (lhs_layout.bytes_count() == rhs_layout.bytes_count()) {
-            return lhs->get_unique_id() < rhs->get_unique_id();
-        }
-
         return (lhs_layout.bytes_count() > rhs_layout.bytes_count());
     });
 
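The dropped branch was a deterministic tie-break for equally sized layouts. A small self-contained sketch of how the comparator behaves after this change (Node and its fields are hypothetical stand-ins for program nodes and layout.bytes_count()); since std::sort is not stable, equal-sized entries no longer keep a guaranteed relative order.

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical stand-in for a program node's output layout size.
struct Node {
    std::size_t bytes;  // layout.bytes_count()
    int unique_id;      // the removed tie-break key
};

int main() {
    std::vector<Node> nodes{{256, 2}, {1024, 0}, {256, 1}};
    // Descending by byte count only, as in get_allocating_order() after
    // this change; ordering among ties is left to the sorting algorithm.
    std::sort(nodes.begin(), nodes.end(), [](const Node& lhs, const Node& rhs) {
        return lhs.bytes > rhs.bytes;
    });
    for (const auto& n : nodes)
        std::cout << n.bytes << " bytes (id " << n.unique_id << ")\n";
    return 0;
}
```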
src/plugins/intel_gpu/src/graph/program_helpers.cpp (0 additions, 17 deletions)

@@ -158,21 +158,4 @@ add_fusing_type onednn_add_fusing_helpers::get_add_fusing_type(
 
     return add_fusing_type::binary_per_oc;
 }
-
-int32_t onednn_add_fusing_helpers::get_reused_eltwmem_idx(const program_node& node) {
-    if (node.get_preferred_impl_type() == impl_types::onednn) {
-        for (auto& fused_op : node.get_fused_primitives()) {
-            if (fused_op.is_type<eltwise>() && fused_op.deps.size() == 1) {
-                // If it is first sum, reuse the buffer
-                auto fusing_type = get_add_fusing_type(node, fused_op);
-                if (fusing_type != add_fusing_type::sum)
-                    continue;
-                if (!fused_op.has_outer_dep())
-                    continue;
-                return fused_op.outer_dep_start_idx;
-            }
-        }
-    }
-    return -1; // if -1, no reused input memory
-}
 } // namespace cldnn