
Commit 3932807

[GPU] Avoid memory allocation for any node that can reuse previous memory (openvinotoolkit#30957)
### Details:
There is a strategy to reuse memory for some intermediate outputs. However, it was applied only after memory had already been allocated for all intermediate outputs, so the peak memory usage was not actually reduced. This PR addresses that problem and reduces the memory footprint of some models, e.g. a 35% memory reduction for the SD1.5 vae_decoder model when generating 512×512 images.

### Tickets:
- [CVS-169229](https://jira.devtools.intel.com/browse/CVS-169229)
1 parent 8231e61 commit 3932807
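
For intuition, here is a minimal standalone sketch (not OpenVINO code; the buffers and values are made up) of the reuse pattern this PR exploits: when a node's fused eltwise is a sum post-op, the node can accumulate its result directly into the summand's buffer instead of allocating a separate output buffer.

```cpp
// Toy illustration of in-place sum post-op reuse; plain std::vector stands in
// for GPU memory objects.
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> residual = {1.f, 2.f, 3.f, 4.f};      // output of the eltwise dependency
    std::vector<float> conv_out = {10.f, 20.f, 30.f, 40.f};  // hypothetical conv result

    // Without reuse: a third buffer would be allocated for conv_out + residual.
    // With reuse: accumulate into `residual` in place, as a sum post-op does.
    for (size_t i = 0; i < residual.size(); ++i)
        residual[i] += conv_out[i];

    for (float v : residual)
        std::printf("%g ", v);  // 11 22 33 44
    std::printf("\n");
    return 0;
}
```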

5 files changed: +41 −35 lines changed

src/plugins/intel_gpu/src/graph/include/program_helpers.h

Lines changed: 1 addition & 0 deletions

```diff
@@ -132,6 +132,7 @@ struct onednn_add_fusing_helpers {
     static void for_eltwise(const program_node& conv_node, eltwise_mode mode,
                             std::function<void(const program_node&, const fused_primitive_desc&)> func);
     static add_fusing_type get_add_fusing_type(const program_node& node, const fused_primitive_desc& desc);
+    static int32_t get_reused_eltwmem_idx(const program_node& node);
 };
 
 using add_fusing_type = onednn_add_fusing_helpers::add_fusing_type;
```

src/plugins/intel_gpu/src/graph/network.cpp

Lines changed: 0 additions & 25 deletions

```diff
@@ -545,31 +545,6 @@ void network::allocate_primitives() {
 
     auto& po = _program->get_processing_order();
 
-    for (auto const& node : po) {
-        if (node->get_preferred_impl_type() == impl_types::onednn) {
-            size_t eltw_dep = 0;
-            for (auto& fused_op : node->get_fused_primitives()) {
-                if (fused_op.is_type<eltwise>() && fused_op.deps.size() == 1) {
-                    // If it is first sum, reuse the buffer
-                    auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op);
-                    if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
-                        continue;
-                    if (!fused_op.has_outer_dep())
-                        continue;
-                    eltw_dep = fused_op.outer_dep_start_idx;
-                    auto& eltw_in = node->get_dependency(eltw_dep);
-                    if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
-                        auto& eltw_inst = _primitives.at(eltw_in.id());
-                        auto& prim_inst = _primitives.at(node->id());
-                        auto& eltw_mem = eltw_inst->output_memory();
-                        auto new_mem = eltw_mem.get_engine()->reinterpret_buffer(eltw_mem, node->get_output_layout());
-                        prim_inst->set_output_memory(new_mem);
-                    }
-                }
-            }
-        }
-    }
     // Update the output memory address of optimized-out layer if it is not valid.
     for (auto const& node : po) {
         if (node->can_be_optimized() && !node->is_dynamic() &&
```

src/plugins/intel_gpu/src/graph/primitive_inst.cpp

Lines changed: 18 additions & 9 deletions

```diff
@@ -2205,16 +2205,25 @@ primitive_inst::primitive_inst(network & network, program_node const& node, bool
             }
         }
 
-        // TODO: Remove WA for arg_max_min node.
-        // For now it's required to handle the case when only second output of TopK primitive is used in plugin,
-        // but kernels always write both outputs to the same memory object which leads to wrong result.
-        if (user_count == 1 && mutable_data_count == 1 && !node.is_type<arg_max_min>()
-            && !node.is_type<experimental_detectron_roi_feature_extractor>()) {
-            for (auto& user : node.get_users())
-                if (user->is_type<mutable_data>())
-                    _outputs[0] = user->as<mutable_data>().get_attached_memory_ptr();
+        if (auto reused_eltwmem_idx = onednn_add_fusing_helpers::get_reused_eltwmem_idx(node); reused_eltwmem_idx != -1) {
+            // sum post-op can use the input buffer as the output buffer
+            auto& eltw_node = node.get_dependency(reused_eltwmem_idx);
+            const auto& eltw_inst = _network.get_primitive(eltw_node.id());
+            auto& eltw_mem = eltw_inst->output_memory();
+            auto new_mem = eltw_mem.get_engine()->reinterpret_buffer(eltw_mem, node.get_output_layout());
+            _outputs.push_back(new_mem);
         } else {
-            _outputs = allocate_outputs();
+            // TODO: Remove WA for arg_max_min node.
+            // For now it's required to handle the case when only second output of TopK primitive is used in plugin,
+            // but kernels always write both outputs to the same memory object which leads to wrong result.
+            if (user_count == 1 && mutable_data_count == 1 && !node.is_type<arg_max_min>() &&
+                !node.is_type<experimental_detectron_roi_feature_extractor>()) {
+                for (auto& user : node.get_users())
+                    if (user->is_type<mutable_data>())
+                        _outputs[0] = user->as<mutable_data>().get_attached_memory_ptr();
+            } else {
+                _outputs = allocate_outputs();
+            }
         }
     }
     if (_node) {
```
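
Taken together, the two hunks above relocate the reuse decision from network::allocate_primitives (where it ran only after every output had already been allocated) into the primitive_inst constructor (where the allocation itself happens). A toy sketch, with made-up buffer sizes, of why only the second placement lowers the peak:

```cpp
// Simulates peak memory under the old flow (allocate, then alias) vs. the new
// flow (reuse instead of allocating). Sizes are hypothetical.
#include <algorithm>
#include <cstdio>

int main() {
    const long buf = 100;  // pretend each intermediate buffer is 100 MB
    long cur = 0, peak = 0;
    auto alloc = [&](long n) { cur += n; peak = std::max(peak, cur); };

    // Old flow: allocate both outputs, then redirect one to alias the other.
    alloc(buf);  // eltwise dependency's output
    alloc(buf);  // node's own output, counted at peak even though aliased later
    cur -= buf;  // the redundant buffer is released after the fact
    std::printf("old peak: %ld MB\n", peak);  // 200

    // New flow: the second allocation is skipped entirely.
    cur = peak = 0;
    alloc(buf);  // eltwise dependency's output, reused as the node's output
    std::printf("new peak: %ld MB\n", peak);  // 100
    return 0;
}
```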

src/plugins/intel_gpu/src/graph/program.cpp

Lines changed: 4 additions & 0 deletions

```diff
@@ -736,6 +736,10 @@ const std::vector<primitive_id>& program::get_allocating_order(bool forced_updat
         if (lhs_layout.is_dynamic())
             return false;
 
+        if (lhs_layout.bytes_count() == rhs_layout.bytes_count()) {
+            return lhs->get_unique_id() < rhs->get_unique_id();
+        }
+
         return (lhs_layout.bytes_count() > rhs_layout.bytes_count());
     });
 
```
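
The added equal-size branch makes the allocating order deterministic: with only the `>` comparison, nodes whose layouts have equal byte counts were left in an unspecified relative order. A hedged sketch (plain structs standing in for program_node) of the same comparator shape:

```cpp
// Sort by size descending, tie-breaking on a unique id so that equal-sized
// entries always land in the same order across runs.
#include <algorithm>
#include <cstdio>
#include <vector>

struct Node { int unique_id; long bytes; };  // stand-ins for program_node fields

int main() {
    std::vector<Node> nodes = {{3, 64}, {1, 128}, {2, 64}};
    std::sort(nodes.begin(), nodes.end(), [](const Node& lhs, const Node& rhs) {
        if (lhs.bytes == rhs.bytes)
            return lhs.unique_id < rhs.unique_id;  // deterministic tie-break
        return lhs.bytes > rhs.bytes;              // largest allocations first
    });
    for (const auto& n : nodes)
        std::printf("id=%d bytes=%ld\n", n.unique_id, n.bytes);  // ids 1, 2, 3
    return 0;
}
```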

src/plugins/intel_gpu/src/graph/program_helpers.cpp

Lines changed: 18 additions & 1 deletion

```diff
@@ -159,5 +159,22 @@ add_fusing_type onednn_add_fusing_helpers::get_add_fusing_type(
     return add_fusing_type::binary_per_oc;
 }
 
-
+int32_t onednn_add_fusing_helpers::get_reused_eltwmem_idx(const program_node& node) {
+    int32_t reused_mem_idx = -1;  // if -1, no reused input memory
+    if (node.get_preferred_impl_type() == impl_types::onednn) {
+        size_t eltw_dep = 0;
+        for (auto& fused_op : node.get_fused_primitives()) {
+            if (fused_op.is_type<eltwise>() && fused_op.deps.size() == 1) {
+                // If it is first sum, reuse the buffer
+                auto fusing_type = get_add_fusing_type(node, fused_op);
+                if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
+                    continue;
+                if (!fused_op.has_outer_dep())
+                    continue;
+                reused_mem_idx = fused_op.outer_dep_start_idx;
+            }
+        }
+    }
+    return reused_mem_idx;
+}
 }  // namespace cldnn
```
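
The helper returns -1 as a "no reusable input" sentinel, which the primitive_inst.cpp hunk consumes through a C++17 if-with-initializer. A self-contained sketch of that calling pattern, with a hypothetical stub standing in for get_reused_eltwmem_idx:

```cpp
#include <cstdint>
#include <cstdio>

// Stub with the same contract as get_reused_eltwmem_idx: the index of a
// dependency whose buffer can be reused, or -1 when there is none.
int32_t get_reused_idx_stub(bool has_fused_sum) {
    return has_fused_sum ? 1 : -1;
}

int main() {
    // C++17 if-with-initializer: the index is scoped to the if/else, so it
    // cannot be used accidentally after the reuse decision has been made.
    if (auto reused_idx = get_reused_idx_stub(true); reused_idx != -1) {
        std::printf("reuse dependency %d's buffer as the output\n", reused_idx);
    } else {
        std::printf("fall back to allocating a fresh output\n");
    }
    return 0;
}
```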
