
Commit 7219aa8

hyunback, timxu826 and e-ddykim authored
[GPU] Ignore SDPAScaleFusion pass when output of Q & V scales have di… (#29554)
Backport of (#29450) to releases/2025/1
Co-authored-by: jag.Xu <jia3.xu@intel.com>
Co-authored-by: Eddy Kim <eddy.kim@intel.com>
1 parent 047ae94 commit 7219aa8
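
For context, a minimal sketch (not taken from the commit page) of the scenario this backport guards against: an SDPA whose K-side scale is a TypeRelaxed Multiply that changes precision, here an i8 key multiplied in f16, mirroring the new test further down. The sdpa_scale_fusion.hpp include path and the standalone pass-manager driver are assumptions.

#include <memory>
#include <vector>

#include "openvino/core/model.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/scaled_dot_product_attention.hpp"
#include "openvino/pass/manager.hpp"
#include "ov_ops/type_relaxed.hpp"
#include "transformations/common_optimizations/sdpa_scale_fusion.hpp"  // assumed include path

int main() {
    using namespace ov;
    // f16 query/value, i8 key: the K-side scale below is TypeRelaxed (i8 in, f16 out).
    const auto query = std::make_shared<op::v0::Parameter>(element::f16, PartialShape{1, 32, -1, 32});
    const auto key = std::make_shared<op::v0::Parameter>(element::i8, PartialShape{1, 32, -1, 32});
    const auto value = std::make_shared<op::v0::Parameter>(element::f16, PartialShape{1, 32, -1, 32});
    const auto scale = op::v0::Constant::create(element::f16, Shape{}, std::vector<float>{8.0f});

    const auto q_scaled = std::make_shared<op::v1::Multiply>(query, scale);
    const auto k_scaled = std::make_shared<op::TypeRelaxed<op::v1::Multiply>>(
        std::vector<element::Type>{element::f16, element::f16},
        std::vector<element::Type>{element::f16},
        op::TemporaryReplaceOutputType(key, element::f16).get(),
        op::TemporaryReplaceOutputType(scale, element::f16).get());
    const auto v_scaled = std::make_shared<op::v1::Multiply>(value, scale);

    const auto sdpa = std::make_shared<op::v13::ScaledDotProductAttention>(q_scaled, k_scaled, v_scaled, false);
    auto model = std::make_shared<Model>(NodeVector{sdpa}, ParameterVector{query, key, value});

    // With this backport, the pass folds only the f16 Q scale and leaves the
    // type-mismatched K scale in the graph instead of dropping it.
    pass::Manager manager;
    manager.register_pass<pass::SDPAScaleFusion>();
    manager.run_passes(model);
    return 0;
}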

2 files changed: +66 −9 lines changed

src/common/transformations/src/transformations/common_optimizations/sdpa_scale_fusion.cpp

Lines changed: 19 additions & 9 deletions

@@ -49,8 +49,8 @@ SDPAScaleFusion::SDPAScaleFusion() {
 
         auto sdpa = m.get_match_root();
 
-        const bool has_q_scale = pattern_map.count(scaled_q);
-        const bool has_k_scale = pattern_map.count(scaled_k);
+        bool has_q_scale = pattern_map.count(scaled_q);
+        bool has_k_scale = pattern_map.count(scaled_k);
 
         // Nothing to do
         if (!has_q_scale && !has_k_scale)
@@ -83,22 +83,32 @@ SDPAScaleFusion::SDPAScaleFusion() {
         // Extract scalar scale values for Q and K if those are constant and set new inputs for SDPA
         if (has_q_scale) {
             scale_q_node = pattern_map.at(scale_q).get_node_shared_ptr();
-            if (ov::is_type<ov::op::v0::Constant>(scale_q_node)) {
-                scale_q_value = ov::as_type_ptr<ov::op::v0::Constant>(scale_q_node)->cast_vector<float>()[0];
-                q_input = pattern_map.at(q);
+            if (pattern_map.at(q).get_element_type() == q_input.get_element_type()) {
+                if (ov::is_type<ov::op::v0::Constant>(scale_q_node)) {
+                    scale_q_value = ov::as_type_ptr<ov::op::v0::Constant>(scale_q_node)->cast_vector<float>()[0];
+                    q_input = pattern_map.at(q);
+                }
+            } else {
+                has_q_scale = false;
             }
         }
         if (has_k_scale) {
             scale_k_node = pattern_map.at(scale_k).get_node_shared_ptr();
-            if (ov::is_type<ov::op::v0::Constant>(scale_k_node)) {
-                scale_k_value = ov::as_type_ptr<ov::op::v0::Constant>(scale_k_node)->cast_vector<float>()[0];
-                k_input = pattern_map.at(k);
+            if (pattern_map.at(k).get_element_type() == k_input.get_element_type()) {
+                if (ov::is_type<ov::op::v0::Constant>(scale_k_node)) {
+                    scale_k_value = ov::as_type_ptr<ov::op::v0::Constant>(scale_k_node)->cast_vector<float>()[0];
+                    k_input = pattern_map.at(k);
+                }
+            } else {
+                has_k_scale = false;
             }
         }
 
+        if (!has_q_scale && !has_k_scale)
+            return false;
+
         Output<ov::Node> new_scale_node;
         auto new_scale_val = prev_scale_value * scale_q_value * scale_k_value;
-
         // If new scale is 1 and we have non-constant scale node for either Q or K, then we can make it a scale of SDPA
         if (new_scale_val == 1.0f) {
             if (has_q_scale && !ov::is_type<ov::op::v0::Constant>(scale_q_node)) {
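
To restate the new condition outside the pass: a Q/K scale value is extracted and folded only when the unscaled tensor has the same element type as the input currently wired to the SDPA and the scale node is a scalar constant; on a type mismatch the corresponding has_*_scale flag is cleared, and if both flags end up cleared the callback now bails out with return false. The helper below is a hypothetical sketch of that check, not code from this commit.

#include <memory>

#include "openvino/core/node.hpp"
#include "openvino/core/type.hpp"
#include "openvino/op/constant.hpp"

// Hypothetical helper mirroring the guard above; it is not part of SDPAScaleFusion.
// It answers: would the pass fold this scale into the combined SDPA scale constant?
bool scale_is_foldable(const ov::Output<ov::Node>& unscaled_input,
                       const ov::Output<ov::Node>& current_sdpa_input,
                       const std::shared_ptr<ov::Node>& scale_node) {
    // A TypeRelaxed Multiply (e.g. i8 input producing f16) fails this element-type
    // check, so the pass now skips it instead of rewiring the SDPA to the raw input.
    if (unscaled_input.get_element_type() != current_sdpa_input.get_element_type())
        return false;
    // Only scalar constant scales are extracted and folded into the new scale value.
    return ov::is_type<ov::op::v0::Constant>(scale_node);
}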

src/common/transformations/tests/common_optimizations/sdpa_scale_fusion_test.cpp

Lines changed: 47 additions & 0 deletions

@@ -15,6 +15,7 @@
 #include "openvino/op/constant.hpp"
 #include "openvino/op/multiply.hpp"
 #include "openvino/op/scaled_dot_product_attention.hpp"
+#include "ov_ops/type_relaxed.hpp"
 
 using namespace testing;
 using namespace ov::pass;
@@ -226,3 +227,49 @@ TEST_F(TransformationTestsF, SDPAScaleFusionTest5) {
     comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
     comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES);
 }
+
+TEST_F(TransformationTestsF, SDPAScaleFusionTest6) {
+    const PartialShape query_shape{1, 32, -1, 32};
+    const PartialShape key_shape{1, 32, -1, 32};
+    const PartialShape value_shape{1, 32, -1, 32};
+
+    const auto query = std::make_shared<ov::op::v0::Parameter>(element::f16, query_shape);
+    const auto key = std::make_shared<ov::op::v0::Parameter>(element::i8, key_shape);
+    const auto value = std::make_shared<ov::op::v0::Parameter>(element::f16, value_shape);
+    const auto scale_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{8.0f});
+    const auto v_scaled = std::make_shared<ov::op::v1::Multiply>(value, scale_const);
+    const auto casual = false;
+    {
+        const auto q_scaled = std::make_shared<ov::op::v1::Multiply>(query, scale_const);
+        const auto k_scaled = std::make_shared<ov::op::TypeRelaxed<ov::op::v1::Multiply>>(
+            std::vector<element::Type>{element::f16, element::f16},
+            std::vector<element::Type>{element::f16},
+            ov::op::TemporaryReplaceOutputType(key, element::f16).get(),
+            ov::op::TemporaryReplaceOutputType(scale_const, element::f16).get());
+        const auto sdpa =
+            std::make_shared<ov::op::v13::ScaledDotProductAttention>(q_scaled, k_scaled, v_scaled, casual);
+
+        model = std::make_shared<ov::Model>(NodeVector{sdpa}, ParameterVector{query, key, value});
+        manager.register_pass<ov::pass::SDPAScaleFusion>();
+    }
+
+    {
+        const auto k_scaled_ref = std::make_shared<ov::op::TypeRelaxed<ov::op::v1::Multiply>>(
+            std::vector<element::Type>{element::f16, element::f16},
+            std::vector<element::Type>{element::f16},
+            ov::op::TemporaryReplaceOutputType(key, element::f16).get(),
+            ov::op::TemporaryReplaceOutputType(scale_const, element::f16).get());
+        const auto new_mask_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{0.0f});
+        const auto new_scale_const =
+            ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{8.0f / std::sqrt(32.0f)});
+        const auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(query,
+                                                                                   k_scaled_ref,
+                                                                                   v_scaled,
+                                                                                   new_mask_const,
+                                                                                   new_scale_const,
+                                                                                   casual);
+        model_ref = std::make_shared<ov::Model>(NodeVector{sdpa}, ParameterVector{query, key, value});
+    }
+
+    comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
+}
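
A note on the reference constants, inferred from the diff rather than stated in the commit: with no explicit scale input, SDPA's implicit scale is 1/sqrt(head_size) = 1/sqrt(32). The f16 Q-side constant (8.0) passes the new element-type check and is folded, giving 8.0 / sqrt(32) ≈ 1.414, which is exactly the value of new_scale_const. The i8-fed TypeRelaxed K-side Multiply fails the check, so k_scaled_ref keeps it in the reference model, and the zero-valued f16 new_mask_const merely fills the attention-mask position so the explicit scale can be passed as the fifth input.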
