Skip to content

Commit 78449ae

Browse files
ceciliapeng2011riverlijunjiepraaszpeterchen-intel
authored
[GPU][QWen2/2.5-VL] improve SDPA performance with cu_seqlens and cu_window_seqlens (#30909)
### Details: The process is like, 1. GenAI provides a special RT_INFO entry to QWen-VL models during compile_model. 2. The plugin detects this entry and the target device capability. 3. The plugin transforms the model input, replacing attention_mask with cu_seqlens. 4. GenAI then performs inference after validating the final model inputs. ### Tickets: - *[168519](https://jira.devtools.intel.com/browse/CVS-168519)* Should work along with - openvinotoolkit/openvino.genai#2330 --------- Co-authored-by: River.Li <[email protected]> Co-authored-by: Pawel Raasz <[email protected]> Co-authored-by: Chen Peter <[email protected]>
1 parent 05b0622 commit 78449ae

File tree

31 files changed

+2829
-92
lines changed

31 files changed

+2829
-92
lines changed
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// Copyright (C) 2025 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#pragma once
6+
7+
#include "openvino/op/op.hpp"
8+
#include "openvino/op/util/sub_graph_base.hpp"
9+
#include "transformations_visibility.hpp"
10+
11+
namespace ov {
12+
namespace op {
13+
namespace internal {
14+
/// \brief Implements the SDPA (Scaled Dot Product Attention) operator for specific ViT models like Qwen2-VL and
15+
/// Qwen2.5-VL. These models exhibit distinct attention mask sparsity patterns where:
16+
/// - Attention occurs only within individual images (for multi-image inputs)
17+
/// - Attention is confined to individual windows (in Qwen2.5-VL)
18+
/// \note The key difference from standard scaled_dot_product_attention is mask handling:
19+
/// This implementation uses cu_seqlens instead of attention_mask.
20+
class TRANSFORMATIONS_API VLSDPA : public ov::op::Op {
21+
public:
22+
OPENVINO_OP("VLSDPA", "ie_internal_opset", ov::op::Op);
23+
24+
VLSDPA() = default;
25+
26+
VLSDPA(const OutputVector& inputs,
27+
const std::vector<int64_t>& order_q = {},
28+
const std::vector<int64_t>& order_k = {},
29+
const std::vector<int64_t>& order_v = {},
30+
const std::vector<int64_t>& order_out = {});
31+
32+
bool visit_attributes(AttributeVisitor& visitor) override;
33+
void validate_and_infer_types() override;
34+
std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
35+
36+
std::vector<int64_t> get_input0_transpose_order() const {
37+
return m_order_q;
38+
}
39+
std::vector<int64_t> get_input1_transpose_order() const {
40+
return m_order_k;
41+
}
42+
std::vector<int64_t> get_input2_transpose_order() const {
43+
return m_order_v;
44+
}
45+
std::vector<int64_t> get_output_transpose_order() const {
46+
return m_order_out;
47+
}
48+
49+
protected:
50+
std::vector<int64_t> m_order_q;
51+
std::vector<int64_t> m_order_k;
52+
std::vector<int64_t> m_order_v;
53+
std::vector<int64_t> m_order_out;
54+
};
55+
56+
} // namespace internal
57+
} // namespace op
58+
} // namespace ov
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
// Copyright (C) 2018-2025 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#include "ov_ops/vl_sdpa.hpp"
6+
7+
#include "augru_sequence_shape_inference.hpp"
8+
#include "itt.hpp"
9+
#include "openvino/op/scaled_dot_product_attention.hpp"
10+
#include "ov_ops/augru_sequence.hpp"
11+
#include "scaled_dot_product_attention_shape_inference.hpp"
12+
13+
namespace ov {
14+
namespace op {
15+
namespace internal {
16+
17+
/// Stores the four transpose orders, forwards the inputs to the base Op and immediately
/// runs validation / type inference for the freshly constructed node.
VLSDPA::VLSDPA(const OutputVector& inputs,
               const std::vector<int64_t>& order_q,
               const std::vector<int64_t>& order_k,
               const std::vector<int64_t>& order_v,
               const std::vector<int64_t>& order_out)
    : Op(inputs),
      m_order_q(order_q),
      m_order_k(order_k),
      m_order_v(order_v),
      m_order_out(order_out) {
    constructor_validate_and_infer_types();
}
29+
30+
/// Creates a new VLSDPA node wired to `new_args` that carries the same transpose orders as this node.
std::shared_ptr<ov::Node> VLSDPA::clone_with_new_inputs(const ov::OutputVector& new_args) const {
    INTERNAL_OP_SCOPE(internal_VLSDPA_clone_with_new_inputs);
    auto copy = std::make_shared<VLSDPA>(new_args, m_order_q, m_order_k, m_order_v, m_order_out);
    return copy;
}
34+
35+
/// Exposes the four transpose orders to `visitor` under the attribute names
/// "order_q", "order_k", "order_v" and "order_out". Always returns true.
bool VLSDPA::visit_attributes(ov::AttributeVisitor& visitor) {
    INTERNAL_OP_SCOPE(internal_VLSDPA_visit_attributes);

    // Inputs first ...
    visitor.on_attribute("order_q", m_order_q);
    visitor.on_attribute("order_k", m_order_k);
    visitor.on_attribute("order_v", m_order_v);
    // ... then the output.
    visitor.on_attribute("order_out", m_order_out);

    return true;
}
43+
44+
void VLSDPA::validate_and_infer_types() {
45+
INTERNAL_OP_SCOPE(internal_VLSDPA_validate_and_infer_types);
46+
OPENVINO_ASSERT(get_input_size() == 4, "VLSDPA must have 4 inputs whereas it has ", get_input_size());
47+
48+
auto out_type = get_input_element_type(0);
49+
50+
const auto& cu_seqlens_type = get_input_element_type(3);
51+
NODE_VALIDATION_CHECK(this,
52+
cu_seqlens_type.is_integral() || cu_seqlens_type.is_dynamic(),
53+
"The element type of cu_seqlens must be integral.");
54+
55+
for (size_t i = 1; i < 3; i++) {
56+
const auto& element_type = get_input_element_type(i);
57+
NODE_VALIDATION_CHECK(this,
58+
element::Type::merge(out_type, out_type, element_type),
59+
"Mixed input types of K/V are not supported.");
60+
}
61+
NODE_VALIDATION_CHECK(this,
62+
out_type.is_real() || out_type.is_dynamic(),
63+
"The element type of the input tensor must be a floating-point.");
64+
65+
const auto& input_shapes = ov::util::get_node_input_partial_shapes(*this);
66+
67+
// validate input shapes
68+
// VLSDPA node is only optimized by QWen2.x-VL model at the moment. Therefore,
69+
// the strict check is applied, which could be relaxed once we see similar patterns in
70+
// more models and corresponding kernel implements the function.
71+
auto shape_q = input_shapes[0];
72+
auto shape_k = input_shapes[1];
73+
auto shape_v = input_shapes[2];
74+
75+
auto shape_q_rank = shape_q.rank();
76+
NODE_VALIDATION_CHECK(this,
77+
shape_q_rank.is_static() && shape_q_rank.get_length() == 3,
78+
"Query input rank length must be 3.");
79+
auto shape_v_rank = shape_v.rank();
80+
NODE_VALIDATION_CHECK(this,
81+
shape_v_rank.is_static() && shape_v_rank.get_length() == 3,
82+
"Key input rank length must be 3.");
83+
auto shape_k_rank = shape_v.rank();
84+
NODE_VALIDATION_CHECK(this,
85+
shape_k_rank.is_static() && shape_k_rank.get_length() == 3,
86+
"Value input rank length must be 3.");
87+
88+
NODE_VALIDATION_CHECK(this,
89+
(m_order_q == m_order_k && m_order_q == m_order_v && m_order_q == m_order_out),
90+
"Value of m_order* must be equal.");
91+
92+
if (m_order_q.size() > 0) {
93+
NODE_VALIDATION_CHECK(this,
94+
(m_order_q == std::vector<int64_t>{1, 0, 2}),
95+
"Value of order_q must be {1, 0, 2}.");
96+
}
97+
98+
// const auto output_shapes = shape_infer(this, input_shapes);
99+
// transpose shape into BHLS(4D), or HLS(3D)
100+
auto transpose_pshape = [](const ov::PartialShape& pshape, const std::vector<int64_t>& order) {
101+
if (order.empty())
102+
return pshape;
103+
104+
auto transposed_pshape = ov::PartialShape::dynamic(pshape.rank());
105+
for (size_t i = 0; i < order.size(); i++) {
106+
transposed_pshape[i] = pshape[order[i]];
107+
}
108+
return transposed_pshape;
109+
};
110+
const auto& output_shape = transpose_pshape(input_shapes[0], m_order_q);
111+
if (m_order_out.size() > 0) {
112+
set_output_type(0, out_type, transpose_pshape(output_shape, m_order_out));
113+
} else {
114+
set_output_type(0, out_type, output_shape);
115+
}
116+
}
117+
118+
} // namespace internal
119+
} // namespace op
120+
} // namespace ov
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
// Copyright (C) 2018-2025 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#include "openvino/pass/sdpa_to_vlsdpa.hpp"
6+
7+
#include <gtest/gtest.h>
8+
9+
#include "common_test_utils/ov_test_utils.hpp"
10+
#include "openvino/core/model.hpp"
11+
#include "openvino/opsets/opset13.hpp"
12+
#include "openvino/pass/manager.hpp"
13+
#include "openvino/runtime/core.hpp"
14+
#include "ov_ops/vl_sdpa.hpp"
15+
16+
using namespace std;
17+
using namespace ov;
18+
using namespace ov::opset13;
19+
20+
namespace {
21+
std::shared_ptr<ov::Model> build_model(const string& mask_name) {
22+
auto q = std::make_shared<Parameter>(element::f32, PartialShape{-1, 8, 32}); /* L,H,S */
23+
auto k = std::make_shared<Parameter>(element::f32, PartialShape{-1, 8, 32});
24+
auto v = std::make_shared<Parameter>(element::f32, PartialShape{-1, 8, 32});
25+
26+
q->set_friendly_name("q");
27+
k->set_friendly_name("k");
28+
v->set_friendly_name("v");
29+
30+
auto transpose_q = std::make_shared<Transpose>(q, Constant::create(element::i64, Shape{3}, {1, 0, 2}));
31+
auto transpose_k = std::make_shared<Transpose>(k, Constant::create(element::i64, Shape{3}, {1, 0, 2}));
32+
auto transpose_v = std::make_shared<Transpose>(v, Constant::create(element::i64, Shape{3}, {1, 0, 2}));
33+
transpose_q->set_friendly_name("transpose_q");
34+
transpose_k->set_friendly_name("transpose_k");
35+
transpose_v->set_friendly_name("transpose_v");
36+
37+
auto mask = std::make_shared<Parameter>(element::f32, PartialShape{1, -1, -1});
38+
mask->set_friendly_name(mask_name);
39+
mask->get_output_tensor(0).set_names({mask_name});
40+
41+
const auto casual = false;
42+
43+
auto sdpa =
44+
std::make_shared<opset13::ScaledDotProductAttention>(transpose_q, transpose_k, transpose_v, mask, casual);
45+
sdpa->set_friendly_name("sdpa");
46+
47+
auto transpose_o = std::make_shared<Transpose>(sdpa, Constant::create(element::i64, Shape{3}, {1, 0, 2}));
48+
transpose_o->set_friendly_name("transpose_o");
49+
50+
return std::make_shared<ov::Model>(OutputVector{transpose_o}, ParameterVector{q, k, v, mask});
51+
}
52+
53+
std::shared_ptr<ov::Model> build_target_model(const string& mask_name) {
54+
auto q = std::make_shared<Parameter>(element::f32, PartialShape{-1, 8, 32}); /* L,H,S */
55+
auto k = std::make_shared<Parameter>(element::f32, PartialShape{-1, 8, 32});
56+
auto v = std::make_shared<Parameter>(element::f32, PartialShape{-1, 8, 32});
57+
q->set_friendly_name("q");
58+
k->set_friendly_name("k");
59+
v->set_friendly_name("v");
60+
61+
auto transpose_q = std::make_shared<Transpose>(q, Constant::create(element::i64, Shape{3}, {1, 0, 2}));
62+
auto transpose_k = std::make_shared<Transpose>(k, Constant::create(element::i64, Shape{3}, {1, 0, 2}));
63+
auto transpose_v = std::make_shared<Transpose>(v, Constant::create(element::i64, Shape{3}, {1, 0, 2}));
64+
transpose_q->set_friendly_name("transpose_q");
65+
transpose_k->set_friendly_name("transpose_k");
66+
transpose_v->set_friendly_name("transpose_v");
67+
68+
auto cuseq_mask = std::make_shared<Parameter>(element::i32, PartialShape{-1});
69+
cuseq_mask->set_friendly_name(mask_name);
70+
cuseq_mask->get_output_tensor(0).set_names({mask_name});
71+
72+
auto vlsdpa =
73+
std::make_shared<ov::op::internal::VLSDPA>(OutputVector{transpose_q, transpose_k, transpose_v, cuseq_mask});
74+
75+
auto transpose_o = std::make_shared<Transpose>(vlsdpa, Constant::create(element::i64, Shape{3}, {1, 0, 2}));
76+
transpose_o->set_friendly_name("transpose_o");
77+
78+
return std::make_shared<ov::Model>(OutputVector{transpose_o}, ParameterVector{q, k, v, cuseq_mask});
79+
}
80+
}; // namespace
81+
82+
// Runs SDPAToVLSDPA on a model whose mask input is named "attention_mask" and compares the result
// against the reference model whose fourth VLSDPA input is named "cu_seq_lens".
TEST_F(TransformationTestsF, SDPA2VLSDPAAttentionMaskTest) {
    disable_rt_info_check();

    // Model under transformation.
    model = build_model("attention_mask");
    model->set_rt_info("QWenVL", "model_type_hint");  // request_vl_sdpa_transformations
    manager.register_pass<ov::pass::SDPAToVLSDPA>();

    // Expected result.
    model_ref = build_target_model("cu_seq_lens");

    // Compare constants, attributes and names in addition to the default checks.
    comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
    comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES);
    comparator.enable(FunctionsComparator::CmpValues::NAMES);
}
95+
96+
// Runs SDPAToVLSDPA on a model whose mask input is named "window_attention_mask" and compares the
// result against the reference model whose fourth VLSDPA input is named "cu_window_seqlens".
TEST_F(TransformationTestsF, SDPA2VLSDPAWindowAttentionMaskTest) {
    disable_rt_info_check();

    // Model under transformation.
    model = build_model("window_attention_mask");
    model->set_rt_info("QWenVL", "model_type_hint");  // request_vl_sdpa_transformations
    manager.register_pass<ov::pass::SDPAToVLSDPA>();

    // Expected result.
    model_ref = build_target_model("cu_window_seqlens");

    // Compare constants, attributes and names in addition to the default checks.
    comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
    comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES);
    comparator.enable(FunctionsComparator::CmpValues::NAMES);
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
// Copyright (C) 2018-2025 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#pragma once
6+
7+
#include <memory>
8+
#include <vector>
9+
10+
#include "openvino/pass/pass.hpp"
11+
12+
namespace ov {
13+
namespace pass {
14+
/**
15+
* @brief The transformation replaces SDPA in ViTs by VLSDPA operation.
16+
* The input "attention_mask" is replaced by "accumulated sequence lengths".
17+
* Please note -
18+
* 1. This pass applies to QWen2.x-VL models only, which relies on user (genai) to set
19+
* rt_info of "model_type_hint".
20+
* 2. The pass will change model inputs w.r.t input names, shape, and data type. Therefore,
21+
* it should be applied at the beginning of transformation pipeline.
22+
* \ingroup ov_pass_cpp_api
23+
*/
24+
class OPENVINO_API SDPAToVLSDPA : public ModelPass {
25+
public:
26+
OPENVINO_MODEL_PASS_RTTI("SDPAToVLSDPA");
27+
28+
explicit SDPAToVLSDPA();
29+
bool run_on_model(const std::shared_ptr<ov::Model>& model) override;
30+
};
31+
} // namespace pass
32+
} // namespace ov

0 commit comments

Comments
 (0)