
Commit 9c2342d

add utilities
1 parent c0bb9aa commit 9c2342d

File tree

onnxscript/rewriter/_fusion_utils.py
onnxscript/rewriter/ort_fusions/attention.py
onnxscript/rewriter/ort_fusions/attention_test.py
onnxscript/rewriter/ort_fusions/mha.py

4 files changed: 46 additions, 46 deletions


onnxscript/rewriter/_fusion_utils.py

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+from __future__ import annotations
+
+from typing import Sequence, Union
+
+
+from onnxscript import ir
+
+Dim = Union[int, ir.SymbolicDim]
+
+def _check_shape(bindings: dict[str, Dim], val: ir.Value, shape: Sequence[str]) -> bool:
+    if val.shape is None:
+        return False
+    if val.shape.rank() != len(shape):
+        return False
+    for actual, expected in zip(val.shape, shape):
+        if expected not in bindings:
+            bindings[expected] = actual  # type: ignore[assignment]
+        elif actual != bindings[expected]:
+            return False
+    return True
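
Not part of the diff, but for context: a minimal sketch of how the shared _check_shape helper accumulates symbolic-dimension bindings across values. The direct ir.Value / ir.Shape construction below is an assumption made for illustration; inside a fusion rule these values come from the matched pattern instead.

# Illustrative sketch only (not code from this commit); constructor usage is assumed.
from onnxscript import ir
from onnxscript.rewriter import _fusion_utils

bindings: dict[str, _fusion_utils.Dim] = {}

# Two values sharing symbolic batch ("B") and sequence ("S") dimensions.
x = ir.Value(name="x", shape=ir.Shape(["B", "S", 640]))
y = ir.Value(name="y", shape=ir.Shape(["B", "S", 160]))

# First call binds "B", "S", and "D" to the dimensions actually found on x.
assert _fusion_utils._check_shape(bindings, x, ["B", "S", "D"])

# Later calls must agree with existing bindings; "Dv" is new and gets bound here.
assert _fusion_utils._check_shape(bindings, y, ["B", "S", "Dv"])

# A value whose "S" dimension disagrees with the earlier binding fails the check.
z = ir.Value(name="z", shape=ir.Shape(["B", 4, 640]))
assert not _fusion_utils._check_shape(bindings, z, ["B", "S", "D"])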

onnxscript/rewriter/ort_fusions/attention.py

Lines changed: 4 additions & 14 deletions
@@ -5,23 +5,12 @@
 from typing import Sequence, Union

 import onnxscript.ir as ir
-from onnxscript.rewriter import pattern
+from onnxscript.rewriter import _fusion_utils, pattern

 Dim = Union[int, ir.SymbolicDim]


 # TODO: Maybe add this check to utilities
-def _check_shape(bindings: dict[str, Dim], val: ir.Value, shape: Sequence[str]) -> bool:
-    if val.shape is None:
-        return False
-    if val.shape.rank() != len(shape):
-        return False
-    for actual, expected in zip(val.shape, shape):
-        if expected not in bindings:
-            bindings[expected] = actual  # type: ignore[assignment]
-        elif actual != bindings[expected]:
-            return False
-    return True


 class AttentionFusion(pattern.RewriteRuleClassBase):
@@ -103,6 +92,7 @@ def pattern(
             present_key = op.Unsqueeze(present_key, [0])
             present_value = op.Unsqueeze(present_value, [0])
             present = op.Concat(present_key, present_value, axis=0)
+            # Return present output first as it captures the complete pattern graph
             return present, attention
         else:
             attention = op.MultiHeadAttention(
@@ -136,7 +126,7 @@ def check(
         self.bindings: dict[str, Dim] = {}

         def no_match(val: ir.Value, dims: Sequence[str]) -> bool:
-            return not _check_shape(self.bindings, val, dims)
+            return not _fusion_utils._check_shape(self.bindings, val, dims)

         if no_match(input, ["B", "S", "D"]):
             return check_result.fail(
@@ -228,7 +218,7 @@ def rewrite(
                 _domain="com.microsoft",
                 _outputs=2,
             )
-            # Return present output first as it captures the complete rewrite pattern graph
+            # Use same output ordering as in pattern
             return present, attention
         else:
             return op.Attention(

onnxscript/rewriter/ort_fusions/attention_test.py

Lines changed: 18 additions & 17 deletions
@@ -24,7 +24,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.batchsize = 2
         self.seqlen = 8
-        self.max_seqlen = 32
+        self.past_seqlen = 32
         self.headsize = 16
         self.num_heads = 10
         self.input_hidden_size = self.headsize * self.num_heads
@@ -36,7 +36,7 @@ def random_inputs(self, with_past=False):
         """Generate random inputs for the model."""
         B = self.batchsize
         S = self.seqlen
-        M = self.max_seqlen
+        Sp = self.past_seqlen
         D = self.input_hidden_size
         N = self.num_heads
         H = self.headsize
@@ -48,22 +48,22 @@ def random_inputs(self, with_past=False):
             "bias": np.random.rand(D_qkv).astype(np.float32),
         }
         if with_past:
-            inputs["past"] = np.random.rand(2, B, N, M, H).astype(np.float32)
+            inputs["past"] = np.random.rand(2, B, N, Sp, H).astype(np.float32)
         return inputs

     def create_model(self, with_past=False):
         """Create a model with or without past inputs."""
         D = self.input_hidden_size
-        Dh_qkv = self.q_hidden_size + self.k_hidden_size + self.v_hidden_size
+        D_qkv = self.q_hidden_size + self.k_hidden_size + self.v_hidden_size

         @script()
         def model_with_mha(input, weight, bias):
-            QKV_no_bias = op.MatMul(input, weight)
-            QKV = op.Add(QKV_no_bias, bias)
+            qkv_no_bias = op.MatMul(input, weight)
+            qkv = op.Add(qkv_no_bias, bias)

-            query_BSDh = op.Slice(QKV, [0], [160], [2])
-            key_BSDh = op.Slice(QKV, [160], [320], [2])
-            value_BSDh = op.Slice(QKV, [320], [480], [2])
+            query_BSDh = op.Slice(qkv, [0], [160], [2])
+            key_BSDh = op.Slice(qkv, [160], [320], [2])
+            value_BSDh = op.Slice(qkv, [320], [480], [2])

             mha = msft_op.MultiHeadAttention(
                 query_BSDh,
@@ -75,12 +75,12 @@ def model_with_mha(input, weight, bias):

         @script()
         def model_with_mha_past(input, weight, bias, past):
-            QKV_no_bias = op.MatMul(input, weight)
-            QKV = op.Add(QKV_no_bias, bias)
+            qkv_no_bias = op.MatMul(input, weight)
+            qkv = op.Add(qkv_no_bias, bias)

-            query_BSDh = op.Slice(QKV, [0], [160], [2])
-            key_BSDh = op.Slice(QKV, [160], [320], [2])
-            value_BSDh = op.Slice(QKV, [320], [480], [2])
+            query_BSDh = op.Slice(qkv, [0], [160], [2])
+            key_BSDh = op.Slice(qkv, [160], [320], [2])
+            value_BSDh = op.Slice(qkv, [320], [480], [2])

             past_key_5d = op.Slice(past, [0], [1], [0])
             past_value_5d = op.Slice(past, [1], [2], [0])
@@ -106,14 +106,15 @@ def model_with_mha_past(input, weight, bias, past):

         input_types = (
             FLOAT["B", "S", D],
-            FLOAT[D, Dh_qkv],
-            FLOAT[Dh_qkv],
+            FLOAT[D, D_qkv],
+            FLOAT[D_qkv],
         )
         output_types = (FLOAT["B", "S", self.v_hidden_size],)

         if with_past:
+            # "T" indicates total sequence length (after concatenation of past and current key/value)
             input_types += (FLOAT[2, "B", self.num_heads, "S", self.headsize],)
-            output_types += (FLOAT[2, "B", self.num_heads, "NS", self.headsize],)
+            output_types += (FLOAT[2, "B", self.num_heads, "T", self.headsize],)
             model_proto = model_with_mha_past.to_model_proto(
                 input_types=input_types,
                 output_types=output_types,
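
Aside (not part of the diff): the literal Slice boundaries 160/320/480 in the test models above follow directly from the configured head dimensions. A plain-Python sketch of that arithmetic; treating the q, k, and v hidden sizes as all equal to headsize * num_heads is an assumption implied by the slice bounds:

# Illustrative arithmetic behind the Slice boundaries in the test models.
headsize = 16
num_heads = 10
hidden = headsize * num_heads   # 160; assumed equal for the q, k, and v projections

q_end = hidden                  # 160 -> op.Slice(qkv, [0], [160], [2])
k_end = q_end + hidden          # 320 -> op.Slice(qkv, [160], [320], [2])
v_end = k_end + hidden          # 480 -> op.Slice(qkv, [320], [480], [2])

print(q_end, k_end, v_end)      # 160 320 480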

onnxscript/rewriter/ort_fusions/mha.py

Lines changed: 2 additions & 15 deletions
@@ -5,7 +5,7 @@
 from typing import Sequence, Union

 import onnxscript.ir as ir
-from onnxscript.rewriter import _ir_utils, pattern
+from onnxscript.rewriter import _fusion_utils, _ir_utils, pattern

 """
 The MultiHeadAttention pattern: generate an instance
@@ -31,19 +31,6 @@
 Dim = Union[int, ir.SymbolicDim]


-def _check_shape(bindings: dict[str, Dim], val: ir.Value, shape: Sequence[str]) -> bool:
-    if val.shape is None:
-        return False
-    if val.shape.rank() != len(shape):
-        return False
-    for actual, expected in zip(val.shape, shape):
-        if expected not in bindings:
-            bindings[expected] = actual  # type: ignore[assignment]
-        elif actual != bindings[expected]:
-            return False
-    return True
-
-
 class MultiHeadAttention(pattern.RewriteRuleClassBase):
     def __init__(self, name, *, transpose_4d: bool):
         super().__init__(name)
@@ -168,7 +155,7 @@ def check(
         bindings: dict[str, Dim] = {}

         def no_match(val: ir.Value, dims: Sequence[str]) -> bool:
-            return not _check_shape(bindings, val, dims)
+            return not _fusion_utils._check_shape(bindings, val, dims)

         if no_match(query_BSD, ["B", "S", "D"]):
             return check_result.fail(

0 commit comments
