
Commit 17060c5

Update on "[ET-VK] Adding all tensor packing support for native layer norm."
This diff updates the ExecuTorch Vulkan backend's `native layer norm` operation to support width-, height-, and channel-packed tensors, and adds new test cases to cases.py to exercise the operation. Differential Revision: [D71663678](https://our.internmc.facebook.com/intern/diff/D71663678/) [ghstack-poisoned]
2 parents 31fa1aa + 1c9e7dd commit 17060c5

File tree

105 files changed (+2013, -819 lines)


.github/scripts/extract_benchmark_results.py

Lines changed: 2 additions & 1 deletion
@@ -360,6 +360,7 @@ def transform(
             "app_type": app_type,
             # Just keep a copy of the benchmark config here
             "benchmark_config": json.dumps(benchmark_config),
+            "job_conclusion": "SUCCESS",
         },
     },
     "model": {
@@ -455,7 +456,7 @@ def transform_failure_record(
         },
         "metric": {
             "name": "FAILURE_REPORT",
-            "benchmark_values": 0,
+            "benchmark_values": [0],
             "target_value": 0,
             "extra_info": {
                 "method": "",

CMakeLists.txt

Lines changed: 11 additions & 31 deletions
@@ -645,13 +645,18 @@ target_link_options_shared_lib(executorch)
 # Real integrations should supply their own YAML file that only lists the
 # operators necessary for the models that will run.
 #
+if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
+  # find pytorch lib here to make it available to all
+  # sub-directories. Find it before including portable so that
+  # optimized_portable_kernels can use it.
+  find_package_torch_headers()
+endif()
+
 if(BUILD_EXECUTORCH_PORTABLE_OPS)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable)
 endif()

 if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
-  # find pytorch lib here to make it available to all sub-directories
-  find_package_torch_headers()
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
 endif()

@@ -764,10 +769,6 @@ if(EXECUTORCH_BUILD_EXTENSION_MODULE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
 endif()

-if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/training)
-endif()
-
 if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util)
 endif()
@@ -872,34 +873,13 @@ if(EXECUTORCH_BUILD_PYBIND)

   if(EXECUTORCH_BUILD_EXTENSION_TRAINING)

-    set(_pybind_training_dep_libs
-      ${TORCH_PYTHON_LIBRARY}
-      etdump
-      executorch
-      util
-      torch
-      extension_training
-    )
-
-    if(EXECUTORCH_BUILD_XNNPACK)
-      # need to explicitly specify XNNPACK and microkernels-prod
-      # here otherwise uses XNNPACK and microkernel-prod symbols from libtorch_cpu
-      list(APPEND _pybind_training_dep_libs xnnpack_backend XNNPACK microkernels-prod)
-    endif()
-
-    # pybind training
-    pybind11_add_module(_training_lib SHARED extension/training/pybindings/_training_lib.cpp)
-
-    target_include_directories(_training_lib PRIVATE ${TORCH_INCLUDE_DIRS})
-    target_compile_options(_training_lib PUBLIC ${_pybind_compile_options})
-    target_link_libraries(_training_lib PRIVATE ${_pybind_training_dep_libs})
-
-    install(TARGETS _training_lib
-      LIBRARY DESTINATION executorch/extension/training/pybindings
-    )
   endif()
 endif()

+if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/training)
+endif()
+
 if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
   # TODO: move all custom kernels to ${CMAKE_CURRENT_SOURCE_DIR}/kernels/custom
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops)

backends/apple/coreml/scripts/install_requirements.sh

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ SCRIPT_DIR_PATH="$(

 # TODO(jathu): remove the need to fetch coremltools to build deps for coreml_executor_runner.
 # Keep this version in sync with: pyproject.toml
-COREMLTOOLS_VERSION="8.1"
+COREMLTOOLS_VERSION="8.2"

 red=`tput setaf 1`
 green=`tput setaf 2`

backends/apple/coreml/test/test_coreml_quantizer.py

Lines changed: 3 additions & 1 deletion
@@ -32,7 +32,9 @@ def quantize_and_compare(
 ) -> None:
     assert quantization_type in {"PTQ", "QAT"}

-    pre_autograd_aten_dialect = export_for_training(model, example_inputs).module()
+    pre_autograd_aten_dialect = export_for_training(
+        model, example_inputs, strict=True
+    ).module()

     quantization_config = LinearQuantizerConfig.from_dict(
         {

backends/apple/mps/test/test_mps_utils.py

Lines changed: 1 addition & 1 deletion
@@ -207,7 +207,7 @@ def lower_module_and_test_output(
     expected_output = model(*sample_inputs)

     model = torch.export.export_for_training(
-        model, sample_inputs, dynamic_shapes=dynamic_shapes
+        model, sample_inputs, dynamic_shapes=dynamic_shapes, strict=True
     ).module()

     edge_program = export_to_edge(
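
Both this change and the CoreML quantizer test above pin `strict=True` on `torch.export.export_for_training` instead of relying on the default, which has shifted between PyTorch releases. A minimal sketch of the call pattern (the module and inputs are placeholders):

```python
# Minimal sketch of the export call the tests now use. MyModule and
# sample_inputs are placeholders; strict=True requests strict
# (TorchDynamo-based) tracing explicitly rather than relying on the default.
import torch


class MyModule(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.layer_norm(x, x.shape[-1:])


sample_inputs = (torch.randn(2, 8),)
exported = torch.export.export_for_training(MyModule(), sample_inputs, strict=True)
pre_autograd_module = exported.module()  # pre-autograd ATen-dialect GraphModule
```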

backends/arm/_passes/__init__.py

Lines changed: 5 additions & 0 deletions
@@ -7,6 +7,7 @@
 from . import arm_pass_utils  # noqa
 from .annotate_channels_last_dim_order_pass import AnnotateChannelsLastDimOrder  # noqa
 from .annotate_decomposed_matmul import AnnotateDecomposedMatmulPass  # noqa
+from .arm_pass import ArmPass  # noqa
 from .cast_int64_pass import CastInt64BuffersToInt32Pass  # noqa
 from .cast_to_int32_pass import CastToInt32Pass  # noqa
 from .conv1d_unsqueeze_pass import Conv1dUnsqueezePass  # noqa
@@ -41,6 +42,10 @@
 from .meandim_to_averagepool_pass import ConvertMeanDimToAveragePoolPass  # noqa
 from .mm_to_bmm_pass import ConvertMmToBmmPass  # noqa
 from .remove_clone_pass import RemoveClonePass  # noqa
+from .replace_scalar_with_tensor_pass import (  # noqa
+    ReplaceScalarWithTensorArgPassTOSABI,
+    ReplaceScalarWithTensorArgPassTOSAMI,
+)
 from .scalars_to_attribute_pass import ScalarsToAttributePass  # noqa
 from .size_adjust_conv2d_pass import SizeAdjustConv2DPass  # noqa
 from .unsqueeze_before_repeat_pass import UnsqueezeBeforeRepeatPass  # noqa

backends/arm/_passes/arm_pass.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+import traceback
+from typing import Optional
+
+import torch
+from executorch.exir.pass_base import ExportPass, NodeMetadata
+
+
+class ArmPass(ExportPass):
+    """Base class for Arm passes"""
+
+    def __init__(self, exported_program: Optional[torch.export.ExportedProgram] = None):
+        super(ArmPass, self).__init__()
+        self.exported_program = exported_program
+
+    def call_operator(self, op, args, kwargs, meta, updated: Optional[bool] = False):
+        if not updated:
+            return super().call_operator(op, args, kwargs, meta)
+
+        # if updated we should update metadata
+        new_meta = {}
+        keys = meta.data.keys()
+        for key in keys:
+            new_meta[key] = meta[key]
+        old_stack_trace = new_meta.get("stack_trace", "")
+        new_meta["stack_trace"] = f"{old_stack_trace}\n{traceback.format_stack()[-2]}"
+        return super().call_operator(op, args, kwargs, NodeMetadata(new_meta))
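
The `updated` flag is how subclasses (such as the decomposed layer norm pass later in this commit) signal that an operator was rewritten, so the base class copies the old metadata and appends the current stack frame to `stack_trace`. A hypothetical subclass to illustrate the pattern (the abs-to-neg rewrite is invented for the example):

```python
# Hypothetical pass illustrating ArmPass.call_operator's `updated` flag.
# The abs -> neg rewrite is invented for the example; only the updated=True
# metadata handling mirrors the new base class above.
from executorch.backends.arm._passes.arm_pass import ArmPass
from executorch.exir.dialects._ops import ops as exir_ops


class ReplaceAbsWithNegPass(ArmPass):
    def call_operator(self, op, args, kwargs, meta):
        if op != exir_ops.edge.aten.abs.default:
            # Untouched ops keep their metadata as-is.
            return super().call_operator(op, args, kwargs, meta)
        # The op is rewritten, so pass updated=True to copy the old meta
        # and extend its stack trace with the current frame.
        return super().call_operator(
            exir_ops.edge.aten.neg.default, args, kwargs, meta, updated=True
        )
```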

backends/arm/_passes/arm_pass_manager.py

Lines changed: 6 additions & 7 deletions
@@ -42,18 +42,17 @@
     MatchArgRanksPass,
     QuantizeOperatorArguments,
     RemoveClonePass,
+    ReplaceScalarWithTensorArgPassTOSABI,
+    ReplaceScalarWithTensorArgPassTOSAMI,
     RetraceFoldedDtypesPass,
     ScalarsToAttributePass,
     SizeAdjustConv2DPass,
     UnsqueezeBeforeRepeatPass,
     UnsqueezeScalarPlaceholdersPass,
 )
+
 from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification
 from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform
-
-from executorch.backends.transforms.replace_scalar_with_tensor import (
-    ReplaceScalarWithTensorArgPass,
-)
 from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass
 from executorch.exir import ExportedProgram
 from executorch.exir.pass_manager import PassManager
@@ -84,7 +83,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule
         if isinstance(self.tosa_spec, Tosa_0_80) and self.tosa_spec.is_U55_subset:
             self.add_pass(CastToInt32Pass())

-        self.add_pass(ReplaceScalarWithTensorArgPass())
+        self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())
         self.add_pass(AnnotateDecomposedMatmulPass())
         self.add_pass(QuantizeOperatorArguments())
         self.add_pass(FoldAndAnnotateQParamsPass())  # type: ignore[call-arg]
@@ -113,7 +112,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule
         return self._transform(exported_program.graph_module)

     def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
-        self.add_pass(ReplaceScalarWithTensorArgPass())
+        self.add_pass(ReplaceScalarWithTensorArgPassTOSAMI())
         self.add_pass(FuseQuantizedActivationPass())
         self.add_pass(RemoveGetItemPass())
         self.add_pass(ConvertSplitToSlicePass())
@@ -170,7 +169,7 @@ def transform_to_backend_pipeline(self, exported_program: ExportedProgram):
         )

     def transform_for_annotation_pipeline(self, graph_module: GraphModule):
-        self.add_pass(ReplaceScalarWithTensorArgPass())
+        self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())
         self.add_pass(ScalarsToAttributePass())
         self.add_pass(DecomposeLayerNormPass())
         self.add_pass(DecomposeVarPass())

backends/arm/_passes/arm_pass_utils.py

Lines changed: 16 additions & 2 deletions
@@ -7,12 +7,12 @@

 # pyre-unsafe

+import traceback
 from inspect import isclass
 from typing import Optional, Sequence

 import torch
 import torch.fx
-
 from executorch.exir import ExportedProgram
 from executorch.exir.dialects._ops import ops as exir_ops

@@ -96,6 +96,7 @@ def create_node(
     kwargs: Optional[dict] = None,
     quantize: bool = False,
     q_params: Optional[tuple] = None,
+    from_node: Optional[torch.fx.Node] = None,
 ):
     """
     Adds a node to 'graph'. graph.inserting_before/after() should be used before the call to decide where to insert the node.
@@ -108,15 +109,26 @@ def create_node(
         args=args,
         kwargs=kwargs or {},
     )
+
+    new_meta = {}
+    if from_node:
+        keys = from_node.meta.keys()
+        for key in keys:
+            new_meta[key] = from_node.meta[key]
+    old_stack_trace = new_meta.get("stack_trace", "")
+    new_meta["stack_trace"] = f"{old_stack_trace}\n{traceback.format_stack()[-2]}"
+    node.meta = new_meta
+
     if quantize and q_params:
-        return insert_q_dq_pair(graph, node, q_params)
+        return insert_q_dq_pair(graph, node, q_params, from_node)
     return node


 def insert_q_dq_pair(
     graph: torch.fx.Graph,
     anchor: torch.fx.Node,
     q_params: tuple,
+    from_node: Optional[torch.fx.Node] = None,
 ):
     """
     Inserts a q dq node pair after the node 'anchor'.
@@ -127,13 +139,15 @@ def insert_q_dq_pair(
         graph=graph,
         op_target=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
         args=(),  # We add the argument last
+        from_node=from_node if from_node else anchor,
     )
     q.meta = anchor.meta
     with graph.inserting_after(q):
         dq = create_node(
             graph=graph,
             op_target=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
             args=(q,) + q_params,
+            from_node=from_node if from_node else anchor,
         )
     dq.meta = q.meta
     anchor.replace_all_uses_with(dq)
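
Taken together, `from_node` lets decomposition passes thread the original node's metadata through every node they create, including inserted q/dq pairs. A short sketch of the intended call pattern (the graph, node, and operands are placeholders):

```python
# Sketch of how a decomposition pass would use the new from_node argument.
# graph, orig_node, x and dims are placeholders; the op target mirrors the
# var op used by the layer norm decomposition below.
from executorch.backends.arm._passes.arm_pass_utils import create_node
from executorch.exir.dialects._ops import ops as exir_ops


def emit_var(graph, orig_node, x, dims):
    with graph.inserting_before(orig_node):
        # The new node inherits orig_node's meta, with the current stack
        # frame appended to its "stack_trace" entry.
        return create_node(
            graph,
            exir_ops.edge.aten.var.correction,
            args=(x, dims),
            kwargs={"correction": 0, "keepdim": True},
            from_node=orig_node,
        )
```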

backends/arm/_passes/decompose_layernorm_pass.py

Lines changed: 30 additions & 8 deletions
@@ -9,9 +9,10 @@
 import operator

 import torch
+from executorch.backends.arm._passes import ArmPass
 from executorch.backends.arm._passes.arm_pass_utils import create_node
 from executorch.exir.dialects._ops import ops as exir_ops
-from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.pass_base import PassResult


 def get_layer_norm_decomposition(op) -> tuple:
@@ -40,7 +41,7 @@ def get_layer_norm_decomposition(op) -> tuple:
     raise RuntimeError(f"Can't get layer_norm composition for op {op}")


-class DecomposeLayerNormPass(ExportPass):
+class DecomposeLayerNormPass(ArmPass):
     """
     layernorm is defined as: ((x - E[x]) / sqrt(Var[x] + eps)) * weights + bias
     Decompose layernorm(x, normalized_shape, weights, bias, eps) to a sequence of:
@@ -111,35 +112,56 @@ def call(self, graph_module: torch.fx.GraphModule):
                     var_op,
                     args=(x, dims),
                     kwargs={"correction": 0, "keepdim": keepdim},
+                    from_node=node,
                 )
                 full = create_node(
                     graph_module.graph,
                     full_op,
                     args=(epsilon_reshaped_shape, epsilon),
                     kwargs={"dtype": dtype},
+                    from_node=node,
+                )
+                add0 = create_node(
+                    graph_module.graph, add_op, args=(var, full), from_node=node
+                )
+                rsqrt = create_node(
+                    graph_module.graph, rsqrt_op, args=(add0,), from_node=node
+                )
+                mul0 = create_node(
+                    graph_module.graph, mul_op, args=(sub, rsqrt), from_node=node
                 )
-                add0 = create_node(graph_module.graph, add_op, args=(var, full))
-                rsqrt = create_node(graph_module.graph, rsqrt_op, args=(add0,))
-                mul0 = create_node(graph_module.graph, mul_op, args=(sub, rsqrt))
                 if weights is not None:
                     weights_reshaped = create_node(
                         graph_module.graph,
                         view_op,
                         args=(weights, weights_reshaped_shape),
+                        from_node=node,
                     )
                     mul1 = create_node(
-                        graph_module.graph, mul_op, args=(mul0, weights_reshaped)
+                        graph_module.graph,
+                        mul_op,
+                        args=(
+                            mul0,
+                            weights_reshaped,
+                        ),
+                        from_node=node,
                     )
                 else:
                     mul1 = mul0
                 output = mul1
                 if bias is not None:
                     bias_reshaped_shape = weights_reshaped_shape
                     bias_reshaped = create_node(
-                        graph_module.graph, view_op, args=(bias, bias_reshaped_shape)
+                        graph_module.graph,
+                        view_op,
+                        args=(bias, bias_reshaped_shape),
+                        from_node=node,
                     )
                     output = create_node(
-                        graph_module.graph, add_op, args=(mul1, bias_reshaped)
+                        graph_module.graph,
+                        add_op,
+                        args=(mul1, bias_reshaped),
+                        from_node=node,
                    )

            users = [user for user in node.users if node != user]
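The docstring's definition, ((x - E[x]) / sqrt(Var[x] + eps)) * weights + bias, can be checked quickly in eager mode; the decomposition above is exactly this algebra expressed as edge-dialect nodes. A small sanity check (illustrative, not part of the commit):

```python
# Eager-mode sanity check that the decomposition in the docstring matches
# torch.nn.functional.layer_norm. Shapes and eps are arbitrary.
import torch

x = torch.randn(2, 3, 8)
normalized_shape = (8,)
weights, bias, eps = torch.randn(8), torch.randn(8), 1e-5

mean = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, unbiased=False, keepdim=True)  # matches correction=0
decomposed = (x - mean) * torch.rsqrt(var + eps) * weights + bias

reference = torch.nn.functional.layer_norm(x, normalized_shape, weights, bias, eps)
torch.testing.assert_close(decomposed, reference)
```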