
Commit c5c96af

fix pre_mul_q placement
1 parent 9c87a4c commit c5c96af

3 files changed: +45, -30 lines


onnxscript/rewriter/ort_fusions/_core.py

Lines changed: 1 addition & 0 deletions
@@ -76,6 +76,7 @@ def fuse_xformers(model: ir.Model) -> tuple[ir.Model, dict[str, int]]:
     fusion_count["partial_rotary_embedding"] = fuse_partial_rotary_embedding(model)
     fusion_count["cos_sin_cache"] = fuse_cos_sin_cache(model)
     fusion_count["sdpa"] = fuse_sdpa(model)
+    model = _pre_optimize(model)
     # Optimize to avoid trying multiple attention-based fusions
     fusion_count["mha"] = fuse_mha(model)
     if fusion_count["mha"] == 0:
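
Note: the added `model = _pre_optimize(model)` runs an optimization pass between SDPA fusion and the attention-based fusions, so `fuse_mha` matches against a cleaned-up graph. For orientation, a hedged usage sketch; `fuse_xformers` and its return type come from the hunk header above, while `ir.load` as the entry point is an assumption:

# Illustrative sketch, not from this commit: run the fusion pipeline and
# inspect the per-fusion counts it returns.
import onnxscript.ir as ir
from onnxscript.rewriter.ort_fusions._core import fuse_xformers  # private module

model = ir.load("model.onnx")               # assumed way to obtain an ir.Model
model, fusion_count = fuse_xformers(model)  # returns (model, dict of fusion counts)
print(fusion_count["sdpa"], fusion_count["mha"])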

onnxscript/rewriter/ort_fusions/mha.py

Lines changed: 17 additions & 10 deletions
@@ -5,7 +5,7 @@
 from typing import Sequence, Union
 
 import onnxscript.ir as ir
-from onnxscript.rewriter import _fusion_utils, pattern
+from onnxscript.rewriter import _fusion_utils, _ir_utils, pattern
 
 """
 The MultiHeadAttention pattern: generate an instance
@@ -37,13 +37,15 @@ def __init__(
         name,
         *,
         transpose_4d: bool,
+        pre_scale_q: bool,
         is_rotary: bool,
         use_mask: bool,
         has_past_present: bool,
         is_cross_attention: bool,
     ):
         super().__init__(name)
         self._transpose_4d = transpose_4d
+        self._pre_scale_q = pre_scale_q
         self._is_rotary = is_rotary
         self._use_mask = use_mask
         self._has_past_present = has_past_present
@@ -64,9 +66,12 @@ def pattern(
         position_ids,
         cos,
         sin,
+        q_scale,
     ):
         # First, query, key, and value are reshaped+transposed from (B, S, D) to (B, H, S, D/H)
 
+        if self._pre_scale_q:
+            query_BSD = op.Mul(query_BSD, q_scale)
         # Reshape from (B, S, D) to (B, S, H, D/H)
         query_BSHDh = op.Reshape(
             query_BSD,
@@ -202,6 +207,8 @@ def check(
         past_key,
         past_value,
         query_BSHDh,
+        key_BSHDh=None,
+        value_BSHDh=None,
         **_,
     ) -> pattern.MatchResult:  # type: ignore[name-defined]
         check_result = pattern.MatchResult()
@@ -239,25 +246,24 @@ def no_match(val: ir.Value, dims: Sequence[str]) -> bool:
                 f"Shape mismatch: {past_value} does not match expected dimensions ['B', 'H', 'Spast', 'Dv']",
                 past_value,
             )
-        """
+
         if no_match(query_BSHDh, ["B", "S", "H", "Dh"]):
             return check_result.fail(
                 f"Shape mismatch: {query_BSHDh} does not match expected dimensions ['B', 'S', 'H', 'Dh']",
                 query_BSHDh,
             )
-
-        if not self.is_cross_attention:
-            if no_match(key_BSHDh, ["B", "S", "H", "Dh"]):
+
+        if not self._is_cross_attention:
+            if key_BSHDh and no_match(key_BSHDh, ["B", "S", "H", "Dh"]):
                 return check_result.fail(
                     f"Shape mismatch: {key_BSHDh} does not match expected dimensions ['B', 'S', 'H', 'Dh']",
                     query_BSHDh,
                 )
-            if no_match(value_BSHDh, ["B", "S", "H", "Dh"]):
+            if value_BSHDh and no_match(value_BSHDh, ["B", "S", "H", "Dh"]):
                 return check_result.fail(
                     f"Shape mismatch: {value_BSHDh} does not match expected dimensions ['B', 'S', 'H', 'Dh']",
                     query_BSHDh,
                 )
-        """
 
         # TODO: mask shape check: ideally, it should be (1 or B, 1 or H, S, St)
         # But this also, unforunately, depends on ORT version.
@@ -283,9 +289,7 @@ def rewrite(
         sin,
         **_,
     ):
-        num_heads = 64
-        # TODO: (fix) Error caused by incorrect SDPA fusion for pre-scaling case
-        # num_heads = _ir_utils.get_dim(query_BSHDh, 2)
+        num_heads = _ir_utils.get_dim(query_BSHDh, 2)
         if not isinstance(num_heads, int):
             return None
 
@@ -341,12 +345,14 @@ def rewrite(
 parameter_combinations = [
     {
         "transpose_4d": transpose_4d,
+        "pre_scale_q": pre_scale_q,
         "is_rotary": is_rotary,
         "use_mask": use_mask,
         "has_past_present": has_past_present,
         "is_cross_attention": is_cross_attention,
     }
     for transpose_4d in [False, True]
+    for pre_scale_q in [True, False]
     for is_rotary in [False, True]
     for use_mask in [False, True]
     for has_past_present in [False, True]
@@ -358,6 +364,7 @@ def rewrite(
     [
         MultiHeadAttention.rule(
             f"MHA_{'4D' if params['transpose_4d'] else '3D'}_Transpose"
+            f"{'_PreScaleQ' if params['pre_scale_q'] else ''}"
             f"{'_Rotary' if params['is_rotary'] else ''}"
             f"{'_Masked' if params['use_mask'] else ''}"
             f"{'_Past' if params['has_past_present'] else ''}"

onnxscript/rewriter/ort_fusions/sdpa.py

Lines changed: 27 additions & 20 deletions
@@ -102,22 +102,22 @@ def check(
         if self._use_mul:
             expected_scaling_factor = 1.0 / expected_scaling_factor
 
-        if self._pre_scale:
+        if self._pre_scale and not self._pre_scale_q:
             # Check if query_scale and key_scale are scalars == sqrt(expected_scaling_factor)
             # If they are scalars but != sqrt(expected_scaling_factor), a custom scale is being used.
             sqrt_scaling_factor = math.sqrt(expected_scaling_factor)
-                # Calculate the scaling factor for query
+            # Calculate the scaling factor for query
             if (query_scale_value := _ir_utils.get_singleton_value(query_scale)) is None:
-                    return check_result.fail(
-                        "Query scale is not a scalar.",
-                        query_scale,
-                    )
-                # Ensure the scaling factor for key is the same as for query
+                return check_result.fail(
+                    "Query scale is not a scalar.",
+                    query_scale,
+                )
+            # Ensure the scaling factor for key is the same as for query
             if (key_scale_value := _ir_utils.get_singleton_value(key_scale)) is None:
-                    return check_result.fail(
-                        "Key scale is not a scalar.",
-                        key_scale,
-                    )
+                return check_result.fail(
+                    "Key scale is not a scalar.",
+                    key_scale,
+                )
             if not math.isclose(query_scale_value, key_scale_value, rel_tol=1e-3):
                 return check_result.fail(
                     "Query and key scales are not equal.",
@@ -129,13 +129,13 @@
                 # Pass no scaling factor to SDPA, SDPA will use the default scaling factor
                 self._scale = None
         else:
-                # Check if qk_scale is a scalar == expected_scaling_factor)
-                # If it is a scalar but != sqrt(expected_scaling_factor), a custom scale is being used
+            # Check if qk_scale is a scalar == expected_scaling_factor)
+            # If it is a scalar but != sqrt(expected_scaling_factor), a custom scale is being used
             if (qk_scale_value := _ir_utils.get_singleton_value(qk_scale)) is None:
-                    return check_result.fail(
-                        "QK scale is not a scalar.",
-                        qk_scale,
-                    )
+                return check_result.fail(
+                    "QK scale is not a scalar.",
+                    qk_scale,
+                )
             if not math.isclose(qk_scale_value, expected_scaling_factor, rel_tol=1e-3):
                 self._scale = qk_scale_value
             else:
@@ -153,13 +153,20 @@ def rewrite(
         key_transposed,
         value,
         mask,
-        query_reshape,
+        query_scale,
+        key_scale,
+        qk_scale,
+        query_reshape=None,
         **_,
     ):
-        if self._has_3d_inputs and self._pre_scale_q:
+        if self._has_3d_inputs and self._pre_scale and self._pre_scale_q:
+            if self._use_mul:
+                query_mul = op.Mul(query, qk_scale)
+            else:
+                query_mul = op.Div(query, qk_scale)
             # Reshape and transpose 3D input of shape (B, S, D)
             # to 4D input of shape (B, N, S, H)
-            queryBNSH = op.Reshape(query, query_reshape)
+            queryBNSH = op.Reshape(query_mul, query_reshape)
             query = op.Transpose(queryBNSH, perm=[0, 2, 1, 3])
 
         sdpa_args = [query, key_transposed, value]
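
In the pre-scale-q case the rewrite now re-applies the captured `qk_scale` to the 3D query before reshaping, choosing `op.Mul` or `op.Div` to mirror how the source graph expressed the scaling. Both branches compute the same value, since dividing by a nonzero scalar s equals multiplying by 1/s; a minimal numpy check (illustrative, not the onnxscript API):

# For a nonzero scalar s, q / s == q * (1/s), so the Mul and Div branches agree.
import numpy as np

q = np.arange(12, dtype=np.float32).reshape(3, 4)
s = np.float32(8.0)  # e.g. sqrt(head_dim) when the graph divides

assert np.allclose(q / s, q * (np.float32(1.0) / s))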
