pytorch
diff --git a/‎backends/arm/_passes/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎backends/arm/_passes/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 7 additions & 11 deletions b/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 7 additions & 11 deletions
diff --git a/‎backends/arm/_passes/fuse_consecutive_rescales_pass.py‎
Lines changed: 174 additions & 0 deletions b/‎backends/arm/_passes/fuse_consecutive_rescales_pass.py‎
Lines changed: 174 additions & 0 deletions
@@ -102,6 +102,7 @@
     QuantizeClampArgumentsPass,
 )
 from .fuse_batch_norm2d_pass import FuseBatchNorm2dPass  # noqa
+from .fuse_consecutive_rescales_pass import FuseConsecutiveRescalesPass  # noqa
 from .fuse_constant_ops_pass import (  # noqa
     ComputeConstantOpsAOTPass,
     FuseConstantArgsPass,
 
@@ -98,6 +98,7 @@
     DecorateFp32toInt32CastingPass,
     FoldAndAnnotateQParamsPass,
     FuseBatchNorm2dPass,
+    FuseConsecutiveRescalesPass,
     FuseConstantArgsPass,
     FuseDuplicateUsersPass,
     FuseEqualPlaceholdersPass,
@@ -183,8 +184,7 @@ def configure_skip_passes(
         override_config: ArmPassPipelineConfig | None = None,
     ) -> tuple[type, ...]:
         """Configures the pass manager to skip certain passes based on the
-        ArmPassPipelineConfig class found in the compile spec.
-        """
+        ArmPassPipelineConfig class found in the compile spec."""
         skip_set: set[type] = set()
 
         config = override_config or self.compile_spec.get_pass_pipeline_config()
@@ -213,9 +213,8 @@ def validate_constraints_mandatory(self):
         """Validates that necessary passes have run before transforming to
         backend.
 
-        Note that this differs from the original validate_constraints function,
-        which only checks the order of passes.
-
+        Note that this differs from the original validate_constraints
+        function, which only checks the order of passes.
         """
         passes_to_run = defaultdict(list)
 
@@ -245,7 +244,6 @@ def insert_passes_before(
         Args:
             target_pass_type: The pass class to insert before (e.g., InsertTableOpsPass)
             passes: List of pass instances to insert
-
         """
         self._pass_insertions.setdefault(
             target_pass_type, PassInsertions()
@@ -260,7 +258,6 @@ def insert_passes_after(
         Args:
             target_pass_type: The pass class to insert after
             passes: List of pass instances to insert
-
         """
         self._pass_insertions.setdefault(
             target_pass_type, PassInsertions()
@@ -273,7 +270,6 @@ def _apply_pass_insertions(self) -> None:
 
         Raises:
             ValueError: If any registered target pass type is not found in the pipeline.
-
         """
         if self._insertions_applied or not self._pass_insertions:
             return
@@ -317,14 +313,13 @@ def _apply_pass_insertions(self) -> None:
         self._insertions_applied = True
 
     def _configure_pass_insertions(self, exported_program: ExportedProgram) -> None:
-        """Hook for subclasses to configure pass insertions. Called at the START
-        of pipeline construction, before any passes are added.
+        """Hook for subclasses to configure pass insertions. Called at the
+        START of pipeline construction, before any passes are added.
 
         Subclasses should override this to call insert_passes_before/after.
 
         Args:
             exported_program: The exported program being transformed
-
         """
         pass
 
@@ -380,6 +375,7 @@ def _tosa_pipeline(
                 # Ticket: MLETORCH-1539
                 DecomposeLinearPass(),
                 InsertRescaleInt32Pass(),
+                FuseConsecutiveRescalesPass(),
                 InsertControlFlowRescalesPass(),
                 DecomposeQuantNodesPass(),
             ]
 
@@ -0,0 +1,174 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import cast, List, Set, Type
+
+import torch
+from executorch.backends.arm._passes.arm_pass import ArmPass
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+from torch.fx import GraphModule, Node
+from torch.fx.passes.infra.pass_base import PassResult
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class FuseConsecutiveRescalesPass(ArmPass):
+    """Drop redundant INT32 -> INT8/INT16 -> INT32 RESCALE round-trips.
+
+    Background: InsertRescaleInt32Pass surrounds every quantized arithmetic
+    or comparison operator (add, sub, mul, abs, eq, ge, gt, le, lt, max,
+    min, sum) with input rescales (INT8/INT16 -> INT32) and one output
+    rescale (INT32 -> INT8/INT16).  If two of those operators are chained
+    (e.g. add1 -> add2), add1's output rescale (R1) feeds straight into one
+    of add2's input rescales (R2).  The value is narrowed and immediately
+    widened again, which is redundant and loses precision.
+
+    For every such R1 -> R2 pair the pass does one of two things:
+
+    - **Identity** (the product of the two scales is ~1.0 and R1's input
+      zero point equals R2's output zero point): R2's users are rewired to
+      read R1's INT32 input directly and the now-unused RESCALEs are erased.
+      Skipping the intermediate INT8/INT16 clamp can in theory change the
+      output by up to ~120 INT8 steps when all inputs sit near the clamp
+      boundary; observed differences for typical distributions are 0-1
+      steps, which is why the tests use qtol=1.
+
+    - **Non-identity**: nothing is changed.  The Vela NPU compiler cannot
+      correctly process an INT32 -> INT32 RESCALE (it produces all-zero NPU
+      outputs), so these pairs keep their INT8/INT16 intermediate.
+
+    An R1 with several users is supported: each R1 -> R2 pair is fused on
+    its own, and R1 itself is only erased once no user (RESCALE or
+    otherwise) still reads it.
+    """
+
+    _passes_required_after: Set[Type[ExportPass]] = set()
+
+    def call(self, graph_module: GraphModule) -> PassResult:
+        """Fuse identity RESCALE pairs in ``graph_module``, flag any change."""
+        graph = graph_module.graph
+        rescales_at_start = sum(1 for n in graph.nodes if _is_rescale(n))
+        pending_erase: List[Node] = []
+        fused_pair_count = 0
+        changed = False
+
+        # Walk a snapshot of the node list; nodes are erased after the walk.
+        for r1 in list(graph.nodes):
+            r1 = cast(Node, r1)
+            if not _is_fuseable_r1(r1):
+                continue
+
+            source = r1.args[0]
+            source_zp = r1.args[3]
+            first_scale = float(r1.args[2][0])  # type: ignore[arg-type]
+
+            # Offer every user of R1 as the R2 half of a pair.
+            fused_here = 0
+            for r2 in list(r1.users):
+                fused = _try_fuse_identity_pair(
+                    r1, r2, source, source_zp, first_scale, pending_erase
+                )
+                if fused:
+                    fused_here += 1
+
+            if fused_here:
+                fused_pair_count += fused_here
+                # R1 is queued after its R2s so they are erased first.
+                pending_erase.append(r1)
+                changed = True
+
+        # Erase only what nothing reads any more (a shared R1 may stay).
+        for stale in pending_erase:
+            if not stale.users:
+                graph.erase_node(stale)
+
+        if not changed:
+            return PassResult(graph_module, False)
+
+        rescales_at_end = sum(1 for n in graph.nodes if _is_rescale(n))
+        logger.info(
+            "FuseConsecutiveRescalesPass: removed %d identity pairs "
+            "(%d RESCALEs: %d -> %d)",
+            fused_pair_count,
+            rescales_at_start - rescales_at_end,
+            rescales_at_start,
+            rescales_at_end,
+        )
+        graph_module.recompile()
+        graph.lint()
+        # super().call() is deliberately not invoked: the pass only rewires
+        # edges and removes nodes, so there is nothing new to retrace.
+        return PassResult(graph_module, True)
+
+
+def _is_rescale(node: Node) -> bool:
+    """Return True if ``node`` calls the TOSA RESCALE backend op."""
+    if node.op != "call_function":
+        return False
+    return node.target == exir_ops.backend.tosa.RESCALE.default
+
+
+def _is_fuseable_r1(node: Node) -> bool:
+    """Tell whether ``node`` can serve as R1, the narrowing half of a pair.
+
+    R1 is a RESCALE to INT8/INT16 with a single (per-tensor) scale whose
+    input is INT32; an input lacking "val" metadata is not dtype-checked.
+    """
+    if not _is_rescale(node):
+        return False
+    to_narrow_int = node.args[1] in (torch.int8, torch.int16)
+    if not (to_narrow_int and len(node.args[2]) == 1):  # type: ignore[arg-type]
+        return False
+    producer = node.args[0]
+    # Without "val" metadata the input dtype is unknown; accept the node.
+    if not isinstance(producer, Node) or "val" not in producer.meta:
+        return True
+    return producer.meta["val"].dtype == torch.int32
+
+
+def _try_fuse_identity_pair(
+    r1: Node,
+    r2: Node,
+    r1_input: Node,
+    r1_input_zp: int,
+    r1_scale: float,
+    nodes_to_erase: List[Node],
+) -> bool:
+    """Fuse ``r1`` -> ``r2`` when the pair is an identity round-trip.
+
+    ``r2`` qualifies if it is a per-tensor RESCALE to INT32 whose input
+    zero point equals ``r1``'s output zero point, ``r1_scale`` times its
+    own scale is within 1e-6 of 1.0, and its output zero point equals
+    ``r1_input_zp``.  On success the users of ``r2`` are rewired to
+    ``r1_input`` and ``r2`` is appended to ``nodes_to_erase``.
+
+    Returns True if fused, False (with nothing modified) otherwise.
+    """
+    widens_back = (
+        _is_rescale(r2)
+        and r2.args[1] == torch.int32
+        and r1.args[4] == r2.args[3]
+        and len(r2.args[2]) == 1  # type: ignore[arg-type]
+    )
+    if not widens_back:
+        return False
+
+    composed = r1_scale * float(r2.args[2][0])  # type: ignore[arg-type, index]
+    zp_round_trips = r1_input_zp == r2.args[4]
+    if not (zp_round_trips and abs(composed - 1.0) < 1e-6):
+        # Non-identity: keep the pair.  One composed INT32 -> INT32 RESCALE
+        # would be semantically correct (the TOSA ref model handles it), but
+        # the Vela NPU compiler produces all-zero outputs for such RESCALEs.
+        return False
+
+    # Identity: hand R1's INT32 input straight to R2's users.  This skips
+    # the intermediate INT8 clamp, which can in theory shift results by up
+    # to ~120 INT8 steps near the clamp boundaries; observed differences
+    # are 0-1 steps, and the tests use qtol=1.
+    r2.replace_all_uses_with(r1_input)
+    nodes_to_erase.append(r2)
+    return True
Original file line number	Diff line number	Diff line change
`@@ -102,6 +102,7 @@`
`102`	`102`	`QuantizeClampArgumentsPass,`
`103`	`103`	`)`
`104`	`104`	`from .fuse_batch_norm2d_pass import FuseBatchNorm2dPass # noqa`
	`105`	`+from .fuse_consecutive_rescales_pass import FuseConsecutiveRescalesPass # noqa`
`105`	`106`	`from .fuse_constant_ops_pass import ( # noqa`
`106`	`107`	`ComputeConstantOpsAOTPass,`
`107`	`108`	`FuseConstantArgsPass,`