Skip to content

Commit f1f7398

Browse files
bdemirb (Baris Demir)
authored
Arm backend: Fix the issue on conv->relu->permute->reshape(5D) (#18136)
cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell Signed-off-by: Baris Demir <baris.demir@arm.com> Co-authored-by: Baris Demir <baris.demir@arm.com>
1 parent 88dc743 commit f1f7398

File tree

7 files changed

+580
-10
lines changed

7 files changed

+580
-10
lines changed

backends/arm/_passes/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,9 @@
138138
RewriteBoolToFp32CastViaInt8Pass,
139139
)
140140
from .rewrite_conv_pass import RewriteConvPass # noqa
141+
from .rewrite_high_rank_singleton_permute_pass import ( # noqa
142+
RewriteHighRankSingletonPermutePass,
143+
)
141144
from .rewrite_index_put_pass import RewriteIndexPutPass # noqa
142145
from .rewrite_le_lt_to_ge_gt_pass import RewriteLeLtToGeGtPass # noqa
143146
from .rewrite_matmul import RewriteMatmulPass # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@
121121
RewriteBoolBitwiseToLogicalPass,
122122
RewriteBoolToFp32CastViaInt8Pass,
123123
RewriteConvPass,
124+
RewriteHighRankSingletonPermutePass,
124125
RewriteIndexPutPass,
125126
RewriteLeLtToGeGtPass,
126127
RewriteMatmulPass,
@@ -366,6 +367,7 @@ def _tosa_pipeline(
366367
CastToInt32Pass(),
367368
BroadcastArgsPass(),
368369
ConvertPermuteSingletonToViewPass(),
370+
RewriteHighRankSingletonPermutePass(),
369371
FuseViewCopyTransformPass(),
370372
DecomposeConvWithInt16ActivationPass(),
371373
DecomposeSumPass(),
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
# Copyright 2026 Arm Limited and/or its affiliates.
2+
#
3+
# This source code is licensed under the BSD-style license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
from typing import Sequence, Set, Type
7+
8+
from executorch.backends.arm._passes import ArmPass
9+
from executorch.exir.dialects._ops import ops as exir_ops
10+
from executorch.exir.pass_base import ExportPass
11+
12+
13+
class RewriteHighRankSingletonPermutePass(ArmPass):
    """Rewrite high-rank permute via a lower-rank permute when singleton dims
    allow it.

    For rank>4 tensors, some backends are fragile around direct high-rank
    TRANSPOSE. When singleton dimensions are present, we can rewrite:

    permute(rank>4) -> view(remove singleton dims) -> permute(reduced rank) ->
    view(restore rank)

    This keeps semantics unchanged while reducing the permute rank.
    """

    # No other pass is required to run after this one.
    _passes_required_after: Set[Type[ExportPass]] = set()

    # Both the functional and the copy variant of permute are rewritten.
    _PERMUTE_OPS = (
        exir_ops.edge.aten.permute.default,
        exir_ops.edge.aten.permute_copy.default,
    )

    @staticmethod
    def _extract_permutation(permutation_arg: object) -> tuple[int, ...] | None:
        """Return the permutation as a tuple of ints, or None if the argument
        is not a list/tuple of ints (in which case the rewrite is skipped)."""
        if not isinstance(permutation_arg, (list, tuple)):
            return None
        if not all(isinstance(dim, int) for dim in permutation_arg):
            return None
        return tuple(permutation_arg)

    @staticmethod
    def _normalize_permutation(
        permutation: Sequence[int], rank: int
    ) -> tuple[int, ...]:
        """Map possibly-negative dims into the range [0, rank)."""
        return tuple(dim % rank for dim in permutation)

    def call_operator(self, op, args, kwargs, meta):
        """Intercept permute ops; rewrite rank>4 permutes over tensors with
        singleton dims into view -> (reduced-rank) permute -> view.

        Falls through to the default behavior whenever any precondition is
        not met, leaving the graph unchanged.
        """
        # Only rewrite the permute ops listed above.
        if op not in self._PERMUTE_OPS:
            return super().call_operator(op, args, kwargs, meta)
        # Need both the input tensor and the permutation argument.
        if len(args) < 2:
            return super().call_operator(op, args, kwargs, meta)
        # The input proxy must expose a fake tensor (.data) to read its shape.
        if not hasattr(args[0], "data"):
            return super().call_operator(op, args, kwargs, meta)
        # The output shape is taken from the node's fake-tensor metadata.
        if "val" not in meta or not hasattr(meta["val"], "shape"):
            return super().call_operator(op, args, kwargs, meta)

        permutation = self._extract_permutation(args[1])
        if permutation is None:
            return super().call_operator(op, args, kwargs, meta)

        input_shape = list(args[0].data.shape)
        output_shape = list(meta["val"].shape)
        rank = len(input_shape)
        # Only rank>4 permutes are rewritten; permute preserves rank, so a
        # mismatching output rank means metadata we do not understand — skip.
        if rank <= 4 or len(output_shape) != rank:
            return super().call_operator(op, args, kwargs, meta)

        normalized_permutation = self._normalize_permutation(permutation, rank)
        # Axes of size 1 can be dropped without changing element order.
        singleton_axes = [axis for axis, dim in enumerate(input_shape) if dim == 1]
        if not singleton_axes:
            return super().call_operator(op, args, kwargs, meta)

        non_singleton_axes = [
            axis for axis in range(rank) if axis not in singleton_axes
        ]
        reduced_rank = len(non_singleton_axes)
        # The whole point is to get down to rank<=4; otherwise bail out.
        if reduced_rank > 4:
            return super().call_operator(op, args, kwargs, meta)

        # Map each surviving original axis to its index in the reduced tensor.
        axis_to_reduced_axis = {
            axis: idx for idx, axis in enumerate(non_singleton_axes)
        }
        # Project the permutation onto the non-singleton axes only.
        reduced_permutation = tuple(
            axis_to_reduced_axis[axis]
            for axis in normalized_permutation
            if axis in axis_to_reduced_axis
        )
        expected_axes = tuple(range(reduced_rank))
        # Sanity check: the projection must itself be a valid permutation of
        # the reduced axes; if not, the original permutation was malformed.
        if tuple(sorted(reduced_permutation)) != expected_axes:
            return super().call_operator(op, args, kwargs, meta)

        # view: drop the singleton dims.
        reduced_input_shape = [input_shape[axis] for axis in non_singleton_axes]
        reduced_input = super().call_operator(
            exir_ops.edge.aten.view_copy.default,
            (args[0], reduced_input_shape),
            {},
            meta,
        )
        # Identity permutation after reduction — no permute node needed.
        if reduced_permutation == expected_axes:
            reduced_output = reduced_input
        else:
            reduced_output = super().call_operator(
                op,
                (reduced_input, reduced_permutation),
                kwargs,
                meta,
            )
        # view: restore the original (permuted) high-rank shape.
        return super().call_operator(
            exir_ops.edge.aten.view_copy.default,
            (reduced_output, output_shape),
            {},
            meta,
        )

backends/arm/_passes/to_tosa_memory_format_pass.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,23 @@ def _channels_last_inverse_order(rank: int, spatial_rank: int) -> tuple[int, ...
113113
inverse[axis] = idx
114114
return tuple(inverse)
115115

116+
def _infer_dim_order_for_node(
    self, node: torch.fx.Node, node_data: torch.Tensor, spatial_rank: int
) -> tuple[int, ...]:
    """Return the TOSA dim order for *node* based on its role and rank.

    Graph inputs/outputs keep their externally-declared dim order;
    transpose-conv weights get OHWI; other rank>=4 tensors get the
    channels-last order for the given spatial rank; lower ranks keep the
    identity order.
    """
    rank = node_data.dim()

    # Inputs and outputs preserve their externally-declared dim order.
    if _is_input(node, self.exported_program) or node.op == "output":
        return node_data.dim_order()

    # Conv transpose weights are serialized in OHWI layout.
    if rank == 4 and _is_transpose_conv2d_weight(node):
        return (1, 2, 3, 0)

    # Internal rank>=4 tensors are laid out channels-last ((N)NHWC).
    if rank >= 4:
        return self._channels_last_order(rank, spatial_rank)
    # Rank < 4: identity order, no layout transformation needed.
    return tuple(range(rank))
132+
116133
def _initial_spatial_rank(self, node: torch.fx.Node) -> int:
117134
"""Infer the initial spatial rank based on the current rank, input node
118135
spatial ranks and node target. A spatial dimension includes Height,
@@ -459,15 +476,7 @@ def call(self, graph_module: torch.fx.GraphModule):
459476
continue
460477
node_data = get_first_fake_tensor(node).data
461478
spatial_rank = node.meta["tosa_spatial_rank"]
462-
if _is_input(node, self.exported_program) or node.op == "output":
463-
dim_order = node_data.dim_order()
464-
else:
465-
if node_data.dim() == 4 and _is_transpose_conv2d_weight(node):
466-
dim_order = (1, 2, 3, 0)
467-
elif node_data.dim() >= 4:
468-
dim_order = self._channels_last_order(node_data.dim(), spatial_rank)
469-
else:
470-
dim_order = tuple(range(node_data.dim())) # type: ignore[assignment]
479+
dim_order = self._infer_dim_order_for_node(node, node_data, spatial_rank)
471480
node.meta["tosa_dim_order"] = dim_order
472481

473482
# Insert TOSA transposes to convert between (N)NCHW and (N)NHWC format.

0 commit comments

Comments (0)