openvinotoolkit
diff --git a/‎src/common/transformations/include/ov_ops/vl_sdpa.hpp‎
Lines changed: 58 additions & 0 deletions b/‎src/common/transformations/include/ov_ops/vl_sdpa.hpp‎
Lines changed: 58 additions & 0 deletions
diff --git a/‎src/common/transformations/src/ov_ops/vl_sdpa.cpp‎
Lines changed: 120 additions & 0 deletions b/‎src/common/transformations/src/ov_ops/vl_sdpa.cpp‎
Lines changed: 120 additions & 0 deletions
diff --git a/‎src/common/transformations/tests/common_optimizations/sdpa_to_vlsdpa_test.cpp‎
Lines changed: 108 additions & 0 deletions b/‎src/common/transformations/tests/common_optimizations/sdpa_to_vlsdpa_test.cpp‎
Lines changed: 108 additions & 0 deletions
diff --git a/‎src/core/include/openvino/pass/sdpa_to_vlsdpa.hpp‎
Lines changed: 32 additions & 0 deletions b/‎src/core/include/openvino/pass/sdpa_to_vlsdpa.hpp‎
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,58 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "openvino/op/op.hpp"
+#include "openvino/op/util/sub_graph_base.hpp"
+#include "transformations_visibility.hpp"
+
+namespace ov {
+namespace op {
+namespace internal {
+/// \brief Implements the SDPA (Scaled Dot Product Attention) operator for specific ViT models like Qwen2-VL and
+/// Qwen2.5-VL. These models exhibit distinct attention mask sparsity patterns where:
+///   - Attention occurs only within individual images (for multi-image inputs)
+///   - Attention is confined to individual windows (in Qwen2.5-VL)
+/// \note The key difference from standard scaled_dot_product_attention is mask handling:
+///       This implementation uses cu_seqlens instead of attention_mask.
+class TRANSFORMATIONS_API VLSDPA : public ov::op::Op {
+public:
+    OPENVINO_OP("VLSDPA", "ie_internal_opset", ov::op::Op);
+
+    VLSDPA() = default;
+
+    /// \brief Constructs a VLSDPA operation.
+    /// \param inputs    Node inputs. Validation requires exactly four: query, key, value (each of rank 3)
+    ///                  and cu_seqlens (integral element type).
+    /// \param order_q   Transpose order associated with the query input.
+    /// \param order_k   Transpose order associated with the key input.
+    /// \param order_v   Transpose order associated with the value input.
+    /// \param order_out Transpose order associated with the output.
+    /// \note Validation requires all four orders to be equal, and either empty or {1, 0, 2}.
+    VLSDPA(const OutputVector& inputs,
+           const std::vector<int64_t>& order_q = {},
+           const std::vector<int64_t>& order_k = {},
+           const std::vector<int64_t>& order_v = {},
+           const std::vector<int64_t>& order_out = {});
+
+    /// \brief Visits the attributes "order_q", "order_k", "order_v" and "order_out".
+    bool visit_attributes(AttributeVisitor& visitor) override;
+    /// \brief Checks input count, element types, ranks and transpose orders, then sets the output type/shape.
+    void validate_and_infer_types() override;
+    /// \brief Creates a new VLSDPA over `new_args` carrying the same four transpose orders.
+    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
+
+    /// \brief Returns a copy of the transpose order stored for input 0 (query).
+    std::vector<int64_t> get_input0_transpose_order() const {
+        return m_order_q;
+    }
+    /// \brief Returns a copy of the transpose order stored for input 1 (key).
+    std::vector<int64_t> get_input1_transpose_order() const {
+        return m_order_k;
+    }
+    /// \brief Returns a copy of the transpose order stored for input 2 (value).
+    std::vector<int64_t> get_input2_transpose_order() const {
+        return m_order_v;
+    }
+    /// \brief Returns a copy of the transpose order stored for the output.
+    std::vector<int64_t> get_output_transpose_order() const {
+        return m_order_out;
+    }
+
+protected:
+    // Transpose orders for query / key / value / output; empty means "no transpose".
+    std::vector<int64_t> m_order_q;
+    std::vector<int64_t> m_order_k;
+    std::vector<int64_t> m_order_v;
+    std::vector<int64_t> m_order_out;
+};
+
+}  // namespace internal
+}  // namespace op
+}  // namespace ov
@@ -0,0 +1,120 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ov_ops/vl_sdpa.hpp"
+
+#include "augru_sequence_shape_inference.hpp"
+#include "itt.hpp"
+#include "openvino/op/scaled_dot_product_attention.hpp"
+#include "ov_ops/augru_sequence.hpp"
+#include "scaled_dot_product_attention_shape_inference.hpp"
+
+namespace ov {
+namespace op {
+namespace internal {
+
+// Builds a VLSDPA node over `inputs`, keeps a copy of each transpose order and
+// immediately runs validation / output type inference.
+VLSDPA::VLSDPA(const OutputVector& inputs,
+               const std::vector<int64_t>& order_q,
+               const std::vector<int64_t>& order_k,
+               const std::vector<int64_t>& order_v,
+               const std::vector<int64_t>& order_out)
+    : Op(inputs),
+      m_order_q{order_q},
+      m_order_k{order_k},
+      m_order_v{order_v},
+      m_order_out{order_out} {
+    constructor_validate_and_infer_types();
+}
+
+// Produces a fresh VLSDPA wired to `new_args`; the four transpose orders are copied from this node.
+std::shared_ptr<ov::Node> VLSDPA::clone_with_new_inputs(const ov::OutputVector& new_args) const {
+    INTERNAL_OP_SCOPE(internal_VLSDPA_clone_with_new_inputs);
+    auto cloned = std::make_shared<VLSDPA>(new_args, m_order_q, m_order_k, m_order_v, m_order_out);
+    return cloned;
+}
+
+// Exposes the four transpose orders to `visitor`, in the order q, k, v, out. Always returns true.
+bool VLSDPA::visit_attributes(ov::AttributeVisitor& visitor) {
+    INTERNAL_OP_SCOPE(internal_VLSDPA_visit_attributes);
+    const auto visit_order = [&visitor](const char* attr_name, std::vector<int64_t>& order) {
+        visitor.on_attribute(attr_name, order);
+    };
+    visit_order("order_q", m_order_q);
+    visit_order("order_k", m_order_k);
+    visit_order("order_v", m_order_v);
+    visit_order("order_out", m_order_out);
+    return true;
+}
+
+// Validates the node and sets the type and shape of output 0.
+//
+// Inputs: 0 - query, 1 - key, 2 - value, 3 - cu_seqlens.
+// Checks performed:
+//   - exactly 4 inputs;
+//   - cu_seqlens has an integral (or dynamic) element type;
+//   - key/value element types can be merged with the query element type;
+//   - the resulting element type is floating-point (or dynamic);
+//   - query, key and value each have static rank 3;
+//   - all four transpose orders are equal and are either empty or {1, 0, 2}.
+// The output shape is the query shape permuted by order_q and then by order_out.
+void VLSDPA::validate_and_infer_types() {
+    INTERNAL_OP_SCOPE(internal_VLSDPA_validate_and_infer_types);
+    OPENVINO_ASSERT(get_input_size() == 4, "VLSDPA must have 4 inputs whereas it has ", get_input_size());
+
+    auto out_type = get_input_element_type(0);
+
+    const auto& cu_seqlens_type = get_input_element_type(3);
+    NODE_VALIDATION_CHECK(this,
+                          cu_seqlens_type.is_integral() || cu_seqlens_type.is_dynamic(),
+                          "The element type of cu_seqlens must be integral.");
+
+    // Fold the K and V element types into out_type; the merge fails on incompatible types.
+    for (size_t i = 1; i < 3; i++) {
+        const auto& element_type = get_input_element_type(i);
+        NODE_VALIDATION_CHECK(this,
+                              element::Type::merge(out_type, out_type, element_type),
+                              "Mixed input types of K/V are not supported.");
+    }
+    NODE_VALIDATION_CHECK(this,
+                          out_type.is_real() || out_type.is_dynamic(),
+                          "The element type of the input tensor must be a floating-point.");
+
+    const auto& input_shapes = ov::util::get_node_input_partial_shapes(*this);
+
+    // validate input shapes
+    // VLSDPA node is only optimized by QWen2.x-VL model at the moment. Therefore,
+    // the strict check is applied, which could be relaxed once we see similar patterns in
+    // more models and corresponding kernel implements the function.
+    const auto& shape_q = input_shapes[0];
+    const auto& shape_k = input_shapes[1];
+    const auto& shape_v = input_shapes[2];
+
+    const auto shape_q_rank = shape_q.rank();
+    NODE_VALIDATION_CHECK(this,
+                          shape_q_rank.is_static() && shape_q_rank.get_length() == 3,
+                          "Query input rank length must be 3.");
+    // Each input is checked against its own shape (the key rank used to be read from the value shape).
+    const auto shape_k_rank = shape_k.rank();
+    NODE_VALIDATION_CHECK(this,
+                          shape_k_rank.is_static() && shape_k_rank.get_length() == 3,
+                          "Key input rank length must be 3.");
+    const auto shape_v_rank = shape_v.rank();
+    NODE_VALIDATION_CHECK(this,
+                          shape_v_rank.is_static() && shape_v_rank.get_length() == 3,
+                          "Value input rank length must be 3.");
+
+    NODE_VALIDATION_CHECK(this,
+                          (m_order_q == m_order_k && m_order_q == m_order_v && m_order_q == m_order_out),
+                          "Value of m_order* must be equal.");
+
+    if (!m_order_q.empty()) {
+        NODE_VALIDATION_CHECK(this,
+                              (m_order_q == std::vector<int64_t>{1, 0, 2}),
+                              "Value of order_q must be {1, 0, 2}.");
+    }
+
+    // Permutes the dimensions of `pshape` by `order`; an empty order leaves the shape unchanged.
+    // Indexing is safe here: the orders were validated above to be empty or {1, 0, 2}, and the rank is 3.
+    auto transpose_pshape = [](const ov::PartialShape& pshape, const std::vector<int64_t>& order) {
+        if (order.empty())
+            return pshape;
+
+        auto transposed_pshape = ov::PartialShape::dynamic(pshape.rank());
+        for (size_t i = 0; i < order.size(); i++) {
+            transposed_pshape[i] = pshape[order[i]];
+        }
+        return transposed_pshape;
+    };
+
+    // The lambda already passes shapes through for an empty order, so no branch on m_order_out is needed.
+    const auto q_transposed_shape = transpose_pshape(shape_q, m_order_q);
+    set_output_type(0, out_type, transpose_pshape(q_transposed_shape, m_order_out));
+}
+
+}  // namespace internal
+}  // namespace op
+}  // namespace ov
@@ -0,0 +1,108 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "openvino/pass/sdpa_to_vlsdpa.hpp"
+
+#include <gtest/gtest.h>
+
+#include "common_test_utils/ov_test_utils.hpp"
+#include "openvino/core/model.hpp"
+#include "openvino/opsets/opset13.hpp"
+#include "openvino/pass/manager.hpp"
+#include "openvino/runtime/core.hpp"
+#include "ov_ops/vl_sdpa.hpp"
+
+using namespace std;
+using namespace ov;
+using namespace ov::opset13;
+
+namespace {
+// Creates one of the Q/K/V inputs: an f32 Parameter of shape {-1, 8, 32} with the given friendly name.
+std::shared_ptr<Parameter> make_qkv_input(const string& name) {
+    auto input = std::make_shared<Parameter>(element::f32, PartialShape{-1, 8, 32}); /* L,H,S */
+    input->set_friendly_name(name);
+    return input;
+}
+
+// Wraps `input` in a Transpose with the constant order {1, 0, 2} and gives it the friendly name `name`.
+std::shared_ptr<Transpose> make_transpose_102(const ov::Output<ov::Node>& input, const string& name) {
+    auto transpose = std::make_shared<Transpose>(input, Constant::create(element::i64, Shape{3}, {1, 0, 2}));
+    transpose->set_friendly_name(name);
+    return transpose;
+}
+
+// Builds the source model: transposed Q/K/V feed a ScaledDotProductAttention together with an
+// f32 mask Parameter of shape {1, -1, -1} named `mask_name`; the SDPA output is transposed back.
+std::shared_ptr<ov::Model> build_model(const string& mask_name) {
+    auto q = make_qkv_input("q");
+    auto k = make_qkv_input("k");
+    auto v = make_qkv_input("v");
+
+    auto transpose_q = make_transpose_102(q, "transpose_q");
+    auto transpose_k = make_transpose_102(k, "transpose_k");
+    auto transpose_v = make_transpose_102(v, "transpose_v");
+
+    auto mask = std::make_shared<Parameter>(element::f32, PartialShape{1, -1, -1});
+    mask->set_friendly_name(mask_name);
+    mask->get_output_tensor(0).set_names({mask_name});
+
+    const auto causal = false;
+
+    auto sdpa =
+        std::make_shared<opset13::ScaledDotProductAttention>(transpose_q, transpose_k, transpose_v, mask, causal);
+    sdpa->set_friendly_name("sdpa");
+
+    auto transpose_o = make_transpose_102(sdpa, "transpose_o");
+
+    return std::make_shared<ov::Model>(OutputVector{transpose_o}, ParameterVector{q, k, v, mask});
+}
+
+// Builds the reference model: same Q/K/V transposes, but the attention is a VLSDPA node whose
+// fourth input is an i32 Parameter of shape {-1} named `mask_name`.
+std::shared_ptr<ov::Model> build_target_model(const string& mask_name) {
+    auto q = make_qkv_input("q");
+    auto k = make_qkv_input("k");
+    auto v = make_qkv_input("v");
+
+    auto transpose_q = make_transpose_102(q, "transpose_q");
+    auto transpose_k = make_transpose_102(k, "transpose_k");
+    auto transpose_v = make_transpose_102(v, "transpose_v");
+
+    auto cuseq_mask = std::make_shared<Parameter>(element::i32, PartialShape{-1});
+    cuseq_mask->set_friendly_name(mask_name);
+    cuseq_mask->get_output_tensor(0).set_names({mask_name});
+
+    auto vlsdpa =
+        std::make_shared<ov::op::internal::VLSDPA>(OutputVector{transpose_q, transpose_k, transpose_v, cuseq_mask});
+
+    auto transpose_o = make_transpose_102(vlsdpa, "transpose_o");
+
+    return std::make_shared<ov::Model>(OutputVector{transpose_o}, ParameterVector{q, k, v, cuseq_mask});
+}
+}  // namespace
+
+// SDPA with a mask input named "attention_mask" is expected to become a VLSDPA whose
+// fourth input is named "cu_seq_lens".
+TEST_F(TransformationTestsF, SDPA2VLSDPAAttentionMaskTest) {
+    using Cmp = FunctionsComparator::CmpValues;
+
+    disable_rt_info_check();
+
+    // Model under transformation; the rt_info hint is what requests the VLSDPA conversion.
+    model = build_model("attention_mask");
+    model->set_rt_info("QWenVL", "model_type_hint");  // request_vl_sdpa_transformations
+    manager.register_pass<ov::pass::SDPAToVLSDPA>();
+
+    // Expected result.
+    model_ref = build_target_model("cu_seq_lens");
+
+    comparator.enable(Cmp::CONST_VALUES);
+    comparator.enable(Cmp::ATTRIBUTES);
+    comparator.enable(Cmp::NAMES);
+}
+
+// SDPA with a mask input named "window_attention_mask" is expected to become a VLSDPA whose
+// fourth input is named "cu_window_seqlens".
+TEST_F(TransformationTestsF, SDPA2VLSDPAWindowAttentionMaskTest) {
+    using Cmp = FunctionsComparator::CmpValues;
+
+    disable_rt_info_check();
+
+    // Model under transformation; the rt_info hint is what requests the VLSDPA conversion.
+    model = build_model("window_attention_mask");
+    model->set_rt_info("QWenVL", "model_type_hint");  // request_vl_sdpa_transformations
+    manager.register_pass<ov::pass::SDPAToVLSDPA>();
+
+    // Expected result.
+    model_ref = build_target_model("cu_window_seqlens");
+
+    comparator.enable(Cmp::CONST_VALUES);
+    comparator.enable(Cmp::ATTRIBUTES);
+    comparator.enable(Cmp::NAMES);
+}
@@ -0,0 +1,32 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "openvino/pass/pass.hpp"
+
+namespace ov {
+namespace pass {
+/**
+ * @brief The transformation replaces SDPA in ViTs with the VLSDPA operation.
+ * The "attention_mask" input is replaced by "accumulated sequence lengths".
+ * Please note:
+ * 1. This pass applies to QWen2.x-VL models only, and relies on the user (genai) to set
+ * the "model_type_hint" entry in the model's rt_info.
+ * 2. The pass changes the model inputs (names, shapes and data types). Therefore,
+ * it should be applied at the beginning of the transformation pipeline.
+ * \ingroup ov_pass_cpp_api
+ */
+class OPENVINO_API SDPAToVLSDPA : public ModelPass {
+public:
+    OPENVINO_MODEL_PASS_RTTI("SDPAToVLSDPA");
+
+    /// @brief Creates the pass; it takes no configuration.
+    explicit SDPAToVLSDPA();
+
+    /// @brief Runs the SDPA -> VLSDPA replacement on the given model.
+    /// @param model Model to transform.
+    /// @return NOTE(review): presumably true when the model was modified, per the usual
+    ///         ModelPass convention - confirm against the implementation.
+    bool run_on_model(const std::shared_ptr<ov::Model>& model) override;
+};
+}  // namespace pass
+}  // namespace ov