
Commit 245e9cb

Author: Github Executorch (committed)
Update on "add begin/end to ExecuTorch pytree::arr"
Allows use of range-for.

Differential Revision: [D68166302](https://our.internmc.facebook.com/intern/diff/D68166302/)

[ghstack-poisoned]
2 parents 897b5fa + 3337fe5 commit 245e9cb
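For context, range-based `for` in C++ only requires that a container expose `begin()` and `end()`. The sketch below uses a simplified stand-in container, not the actual `pytree::arr` implementation (whose real layout, namespace, and member signatures are not shown in this diff), to illustrate the pattern this commit enables:

```cpp
// Minimal sketch: a simplified stand-in for an arr-like container.
// This is NOT the real ExecuTorch pytree::arr; it only illustrates how
// exposing begin()/end() lets callers iterate with range-based for.
#include <cstddef>
#include <iostream>
#include <memory>

template <typename T>
class Arr {
 public:
  explicit Arr(std::size_t size) : size_(size), data_(new T[size]()) {}

  // Iterator accessors of the kind the commit title describes; the exact
  // signatures in the real header may differ.
  T* begin() { return data_.get(); }
  T* end() { return data_.get() + size_; }
  const T* begin() const { return data_.get(); }
  const T* end() const { return data_.get() + size_; }

  T& operator[](std::size_t i) { return data_[i]; }
  std::size_t size() const { return size_; }

 private:
  std::size_t size_;
  std::unique_ptr<T[]> data_;
};

int main() {
  Arr<int> values(3);
  values[0] = 1;
  values[1] = 2;
  values[2] = 3;

  // Range-based for works because begin()/end() are available.
  for (int v : values) {
    std::cout << v << '\n';
  }
  return 0;
}
```

Exposing iterators this way also lets callers hand the container to standard algorithms without copying the elements out first.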

File tree: 283 files changed (+8339, -2708 lines)


.buckconfig (+3)

@@ -33,3 +33,6 @@
 **/.git, \
 cmake-out, \
 pip-out
+
+[buck2]
+restarter=true

.ci/docker/ci_commit_pins/pytorch.txt (+1, -1)

@@ -1 +1 @@
-0a94bb432ed75cc2d950d81b2921363218a7e459
+27e35de6c288bffad1b4d18b393579c1d1a95547

.ci/docker/conda-env-ci.txt (+1)

@@ -1,4 +1,5 @@
 cmake=3.22.1
 ninja=1.10.2
 libuv
+llvm-openmp
 pkg-config

.ci/scripts/gather_benchmark_configs.py (+1, -1)

@@ -238,7 +238,7 @@ def set_output(name: str, val: Any) -> None:
     try:
         with open(github_output, "a") as env:
             env.write(f"{name}={val}\n")
-    except PermissionError:
+    except (PermissionError, FileNotFoundError):
         # Fall back to printing in case of permission error in unit tests
         print(f"::set-output name={name}::{val}")

.ci/scripts/setup-linux.sh (+4, -1)

@@ -23,4 +23,7 @@ fi
 # of nightly. This allows CI to test against latest commits from PyTorch
 install_executorch "use-pt-pinned-commit"
 build_executorch_runner "${BUILD_TOOL}"
-do_not_use_nightly_on_ci
+
+if [[ "${GITHUB_BASE_REF:-}" == *main* || "${GITHUB_BASE_REF:-}" == *gh* ]]; then
+  do_not_use_nightly_on_ci
+fi

.ci/scripts/setup-macos.sh (+5, -1)

@@ -121,6 +121,7 @@ setup_macos_env_variables
 # NB: we need buck2 in all cases because cmake build also depends on calling
 # buck2 atm
 install_buck
+brew install libomp
 install_pip_dependencies

 # TODO(huydhn): Unlike our self-hosted runner, GitHub runner doesn't have access
@@ -136,4 +137,7 @@ install_pytorch_and_domains
 # the pinned commit from PyTorch
 install_executorch "use-pt-pinned-commit"
 build_executorch_runner "${BUILD_TOOL}"
-do_not_use_nightly_on_ci
+
+if [[ "${GITHUB_BASE_REF:-}" == *main* || "${GITHUB_BASE_REF:-}" == *gh* ]]; then
+  do_not_use_nightly_on_ci
+fi

.github/workflows/doc-build.yml (+1, -1)

@@ -84,8 +84,8 @@ jobs:
     needs: build
     if: github.repository == 'pytorch/executorch' && github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/v'))
     permissions:
+      id-token: write
       contents: write
-      contents: read
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       repository: pytorch/executorch

.github/workflows/pull.yml (+2, -17)

@@ -212,17 +212,14 @@ jobs:
       docker-image: executorch-ubuntu-22.04-clang12
       submodules: 'true'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 90
+      timeout: 180
       script: |
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"

         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

-        # install pybind
-        bash install_executorch.sh --pybind xnnpack
-
         # install Llava requirements
         bash examples/models/llama/install_requirements.sh
         bash examples/models/llava/install_requirements.sh
@@ -483,9 +480,6 @@ jobs:

         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

-        # install pybind
-        bash install_executorch.sh --pybind xnnpack
-
         # install phi-3-mini requirements
         bash examples/models/phi-3-mini/install_requirements.sh
@@ -513,9 +507,6 @@ jobs:

         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

-        # install pybind
-        bash install_executorch.sh --pybind xnnpack
-
         # install llama requirements
         bash examples/models/llama/install_requirements.sh
@@ -535,17 +526,14 @@ jobs:
       docker-image: executorch-ubuntu-22.04-clang12
       submodules: 'true'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 90
+      timeout: 180
      script: |
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"

         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

-        # install pybind
-        bash install_executorch.sh --pybind xnnpack
-
         # install llama requirements
         bash examples/models/llama/install_requirements.sh
@@ -573,9 +561,6 @@ jobs:

         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

-        # install pybind
-        bash install_executorch.sh --pybind xnnpack
-
         # install llama requirements
         bash examples/models/llama/install_requirements.sh

.github/workflows/trunk.yml (+2, -2)

@@ -150,7 +150,7 @@ jobs:
         conda activate "${CONDA_ENV}"

         source .ci/scripts/utils.sh
-        install_executorch
+        install_executorch "use-pt-pinned-commit"

         .ci/scripts/setup-arm-baremetal-tools.sh
@@ -180,7 +180,7 @@ jobs:
         conda activate "${CONDA_ENV}"

         source .ci/scripts/utils.sh
-        install_executorch
+        install_executorch "use-pt-pinned-commit"

         .ci/scripts/setup-arm-baremetal-tools.sh

.gitmodules (+3)

@@ -64,3 +64,6 @@
 [submodule "third-party/ao"]
 	path = third-party/ao
 	url = https://github.com/pytorch/ao.git
+[submodule "backends/cadence/utils/FACTO"]
+	path = backends/cadence/utils/FACTO
+	url = https://github.com/pytorch-labs/FACTO.git

.lintrunner.toml (+1, -1)

@@ -1,4 +1,4 @@
-merge_base_with = "origin/main"
+merge_base_with = "main"

 [[linter]]
 code = 'FLAKE8'

.mypy.ini (+3)

@@ -77,6 +77,9 @@ ignore_missing_imports = True
 [mypy-ruamel]
 ignore_missing_imports = True

+[mypy-serializer.*]
+ignore_missing_imports = True
+
 [mypy-setuptools.*]
 ignore_missing_imports = True

CMakeLists.txt (+36)

@@ -240,6 +240,13 @@ cmake_dependent_option(
   "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF
 )

+
+if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
+  set(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)
+  set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
+  set(EXECUTORCH_BUILD_EXTENSION_MODULE ON)
+endif()
+
 if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT)
   set(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)
   set(EXECUTORCH_BUILD_KERNELS_CUSTOM ON)
@@ -791,6 +798,35 @@ if(EXECUTORCH_BUILD_PYBIND)
   install(TARGETS portable_lib
           LIBRARY DESTINATION executorch/extension/pybindings
   )
+
+  if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
+
+    set(_pybind_training_dep_libs
+        ${TORCH_PYTHON_LIBRARY}
+        etdump
+        executorch
+        util
+        torch
+        extension_training
+    )
+
+    if(EXECUTORCH_BUILD_XNNPACK)
+      # need to explicitly specify XNNPACK and microkernels-prod
+      # here otherwise uses XNNPACK and microkernel-prod symbols from libtorch_cpu
+      list(APPEND _pybind_training_dep_libs xnnpack_backend XNNPACK microkernels-prod)
+    endif()
+
+    # pybind training
+    pybind11_add_module(_training_lib SHARED extension/training/pybindings/_training_lib.cpp)
+
+    target_include_directories(_training_lib PRIVATE ${TORCH_INCLUDE_DIRS})
+    target_compile_options(_training_lib PUBLIC ${_pybind_compile_options})
+    target_link_libraries(_training_lib PRIVATE ${_pybind_training_dep_libs})
+
+    install(TARGETS _training_lib
+            LIBRARY DESTINATION executorch/extension/training/pybindings
+    )
+  endif()
 endif()

 if(EXECUTORCH_BUILD_KERNELS_CUSTOM)

README-wheel.md (+8, -7)

@@ -4,20 +4,21 @@ standard on-device iOS and Android mobile deployments. One of the main goals for
 ExecuTorch is to enable wider customization and deployment capabilities of the
 PyTorch programs.

-The `executorch` pip package is in alpha.
-* Supported python versions: 3.10, 3.11
+The `executorch` pip package is in beta.
+* Supported python versions: 3.10, 3.11, 3.12
 * Compatible systems: Linux x86_64, macOS aarch64

-The prebuilt `executorch.extension.pybindings.portable_lib` module included in
-this package provides a way to run ExecuTorch `.pte` files, with some
-restrictions:
+The prebuilt `executorch.runtime` module included in this package provides a way
+to run ExecuTorch `.pte` files, with some restrictions:
 * Only [core ATen
   operators](https://pytorch.org/executorch/stable/ir-ops-set-definition.html)
   are linked into the prebuilt module
 * Only the [XNNPACK backend
   delegate](https://pytorch.org/executorch/main/native-delegates-executorch-xnnpack-delegate.html)
-  is linked into the prebuilt module
-* [macOS only] [Core ML](https://pytorch.org/executorch/main/build-run-coreml.html) and [MPS](https://pytorch.org/executorch/main/build-run-mps.html) backend delegates are linked into the prebuilt module.
+  is linked into the prebuilt module.
+* \[macOS only] [Core ML](https://pytorch.org/executorch/main/build-run-coreml.html)
+  and [MPS](https://pytorch.org/executorch/main/build-run-mps.html) backend
+  delegates are also linked into the prebuilt module.

 Please visit the [ExecuTorch website](https://pytorch.org/executorch/) for
 tutorials and documentation. Here are some starting points:

README.md (+42, -28)

@@ -1,9 +1,37 @@
-# ExecuTorch
-
-**ExecuTorch** is an end-to-end solution for enabling on-device inference
-capabilities across mobile and edge devices including wearables, embedded
-devices and microcontrollers. It is part of the PyTorch Edge ecosystem and
-enables efficient deployment of PyTorch models to edge devices.
+<div align="center">
+<img src="./docs/source/_static/img/et-logo.png" alt="Logo" width="200">
+<h1 align="center">ExecuTorch: A powerful on-device AI Framework</h1>
+</div>
+
+
+<div align="center">
+<a href="https://github.com/pytorch/executorch/graphs/contributors"><img src="https://img.shields.io/github/contributors/pytorch/executorch?style=for-the-badge&color=blue" alt="Contributors"></a>
+<a href="https://github.com/pytorch/executorch/stargazers"><img src="https://img.shields.io/github/stars/pytorch/executorch?style=for-the-badge&color=blue" alt="Stargazers"></a>
+<a href="https://discord.gg/MeacgB7A"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community"></a>
+<a href="https://pytorch.org/executorch/stable/index.html"><img src="https://img.shields.io/badge/Documentation-000?logo=googledocs&logoColor=FFE165&style=for-the-badge" alt="Check out the documentation"></a>
+<hr>
+</div>
+
+**ExecuTorch** is an end-to-end solution for on-device inference and training. It powers much of Meta's on-device AI experiences across Facebook, Instagram, Meta Quest, Ray-Ban Meta Smart Glasses, WhatsApp, and more.
+
+It supports a wide range of models including LLMs (Large Language Models), CV (Computer Vision), ASR (Automatic Speech Recognition), and TTS (Text to Speech).
+
+Platform Support:
+- Operating Systems:
+  - iOS
+  - Mac
+  - Android
+  - Linux
+  - Microcontrollers
+
+- Hardware Acceleration:
+  - Apple
+  - Arm
+  - Cadence
+  - MediaTek
+  - Qualcomm
+  - Vulkan
+  - XNNPACK

 Key value propositions of ExecuTorch are:

@@ -17,35 +45,21 @@ Key value propositions of ExecuTorch are:
   experience due to a lightweight runtime and utilizing full hardware
   capabilities such as CPUs, NPUs, and DSPs.

-For a comprehensive technical overview of ExecuTorch and step-by-step tutorials,
-please visit our documentation website [for the latest release](https://pytorch.org/executorch/stable/index.html) (or the [main branch](https://pytorch.org/executorch/main/index.html)).
-
-Check out the [Getting Started](https://pytorch.org/executorch/stable/getting-started-setup.html#quick-setup-colab-jupyter-notebook-prototype) page for a quick spin.
-
-Check out the examples of [Llama](./examples/models/llama/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch.
+## Getting Started
+To get started you can:

+- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/main/index.html) on getting things running locally and deploy a model to a device
+- Use this [Colab Notebook](https://pytorch.org/executorch/stable/getting-started-setup.html#quick-setup-colab-jupyter-notebook-prototype) to start playing around right away
+- Jump straight into LLMs use cases by following specific instructions for [Llama](./examples/models/llama/README.md) and [Llava](./examples/models/llava/README.md)

-**[UPDATE - 10/24]** We have added support for running [Llama 3.2 Quantized 1B/3B](./examples/models/llama/README.md) models via ExecuTorch.
-
-## Feedback
+## Feedback and Engagement

 We welcome any feedback, suggestions, and bug reports from the community to help
-us improve our technology. Please use the [PyTorch
-Forums](https://discuss.pytorch.org/c/executorch) for discussion and feedback
-about ExecuTorch using the **ExecuTorch** category, and our [GitHub
-repository](https://github.com/pytorch/executorch/issues) for bug reporting.
-
-We recommend using the latest release tag from the
-[Releases](https://github.com/pytorch/executorch/releases) page when developing.
+us improve our technology. Check out the [Discussion Board](https://github.com/pytorch/executorch/discussions) or chat real time with us on [Discord](https://discord.gg/MeacgB7A)

 ## Contributing

-See [CONTRIBUTING.md](CONTRIBUTING.md) for details about issues, PRs, code
-style, CI jobs, and other development topics.
-
-To connect with us and other community members, we invite you to join PyTorch Slack community by filling out this [form](https://docs.google.com/forms/d/e/1FAIpQLSeADnUNW36fjKjYzyHDOzEB_abKQE9b6gqqW9NXse6O0MWh0A/viewform). Once you've joined, you can:
-* Head to the `#executorch-general` channel for general questions, discussion, and community support.
-* Join the `#executorch-contributors` channel if you're interested in contributing directly to project development.
+We welcome contributions. To get started review the [guidelines](CONTRIBUTING.md) and chat with us on [Discord](https://discord.gg/MeacgB7A)


 ## Directory Structure

backends/arm/README.md (+12)

@@ -122,6 +122,18 @@ The you can run the tests with
 pytest -c /dev/null -v -n auto backends/arm/test --arm_run_corstoneFVP
 ```

+## Passes
+
+With the default passes in the Arm Ethos-U backend, assuming the model lowers fully to the
+Ethos-U, the exported program is composed of a Quantize node, Ethos-U custom delegate
+and a Dequantize node. In some circumstances, you may want to feed quantized input to the Neural
+Network straight away, e.g. if you have a camera sensor outputting (u)int8 data and keep all the
+arithmetic of the application in the int8 domain. For these cases, you can apply the
+`exir/passes/quantize_io_pass.py`. See the unit test in `executorch/backends/arm/
+test/passes/test_ioquantization_pass.py`for an example how to feed quantized inputs and
+obtain quantized outputs.
+
+
 ### Code coverage

 To get code coverage:

backends/arm/_passes/_debug_passes.py (new file, +23)

@@ -0,0 +1,23 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.devtools.visualization.visualization_utils import visualize_graph
+from executorch.exir import ExportedProgram
+from executorch.exir.pass_base import ExportPass, PassResult
+
+
+class VisualizePass(ExportPass):
+    """
+    This pass visualizes the graph at the point of insertion in the pass manager
+    """
+
+    def __init__(self, exported_program: ExportedProgram) -> None:
+        super().__init__()
+        self.exported_program = exported_program
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        visualize_graph(graph_module, self.exported_program)
+        return PassResult(graph_module, False)

backends/arm/_passes/annotate_channels_last_dim_order_pass.py (+4)

@@ -129,6 +129,7 @@ def insert_input_transpose(node, input_node, graph_module):
         permute_node.meta["tosa_dim_order"] = tuple(
             range(len(input_node.meta["val"].size()))
         )
+        permute_node.meta["val"] = input_node.meta["val"]

     @staticmethod
     def insert_output_transpose(node, graph_module):
@@ -141,6 +142,9 @@ def insert_output_transpose(node, graph_module):
         permute_node.meta["tosa_dim_order"] = (
             AnnotateChannelsLastDimOrder.NHWC_order
         )
+        permute_node.meta["val"] = node.meta["val"].permute(
+            AnnotateChannelsLastDimOrder.NHWC_order
+        )
         node.meta["tosa_dim_order"] = (0, 1, 2, 3)
         users = [user for user in node.users if user != permute_node]
         for user in users:
