Fix handling of attention-bias in MHA fusion (#2332)
In models generated from PyTorch, masks may have shapes that are merely
broadcastable to (B, H, S, St): e.g., a 2D mask of shape (S, St), or even
shape (1, 1, 1, St) in one example.
ONNX's opset 23 Attention op allows masks of these shapes. However, ORT's
contrib ops (MHA, Attention) only allow a mask of shape (1 or B, 1 or H, S,
St). That is, they support broadcast only for the first two dimensions.
(Even that is not supported by some earlier versions of ORT, which we
don't consider here.)
So, while doing fusion for MHA, we should expand the mask to ensure it
satisfies the constraints of MHA/Attention, as sketched below.
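
The following is a minimal numpy sketch of the idea, not the actual rewrite-rule code in this PR: it assumes a hypothetical helper `expand_mask_for_mha` and shows how a broadcastable mask could be materialized in its last two dimensions, while leaving the leading dimensions alone since ORT's contrib ops do broadcast those.

```python
import numpy as np


def expand_mask_for_mha(mask: np.ndarray, S: int, St: int) -> np.ndarray:
    """Expand `mask` so its last two dims are exactly (S, St).

    Hypothetical helper for illustration only. Leading dims are padded with
    1s (or kept as B/H if already present), since ORT's MHA/Attention contrib
    ops support broadcasting in the first two dimensions but not the last two.
    """
    # Pad the rank up to 4 with leading singleton dims, e.g. (S, St) -> (1, 1, S, St).
    while mask.ndim < 4:
        mask = np.expand_dims(mask, axis=0)
    b, h = mask.shape[0], mask.shape[1]
    # Materialize the last two dims; keep the leading dims as-is.
    return np.broadcast_to(mask, (b, h, S, St))


# Example: a (1, 1, 1, St) mask is expanded to (1, 1, S, St).
mask = np.zeros((1, 1, 1, 7), dtype=np.float32)
print(expand_mask_for_mha(mask, S=5, St=7).shape)  # (1, 1, 5, 7)
```

In the actual fusion, the analogous expansion would be expressed as ONNX operations (e.g. an Expand to the required (.., .., S, St) shape) inserted on the mask input before it feeds the fused MHA/Attention node.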
---------
Signed-off-by: Ganesan Ramalingam <[email protected]>