diff --git a/.gitmodules b/.gitmodules
index 980a999eff0..3138391f7c0 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -31,3 +31,9 @@
 [submodule "backends/arm/third-party/serialization_lib"]
 	path = backends/arm/third-party/serialization_lib
 	url = https://git.mlplatform.org/tosa/serialization_lib.git
+[submodule "backends/arm/third-party/ethos-u-core-driver"]
+	path = backends/arm/third-party/ethos-u-core-driver
+	url = https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git
+[submodule "backends/arm/third-party/cmsis"]
+	path = backends/arm/third-party/cmsis
+	url = https://github.com/ARM-software/CMSIS_5.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2cc626e0d5d..1678bc2d8a3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -59,8 +59,18 @@ endif()
 # - targets in the current directory, before and after this command is invoked
 # - targets in sub-directories added after this command is invoked
 if(CMAKE_BUILD_TYPE STREQUAL "Release")
+  # To enable logging in Release mode
+  option(
+    EXECUTORCH_ENABLE_LOGGING_RELEASE_MODE
+    "Enable logging in release mode" OFF)
+
+  set(_ET_LOG_ENABLE 0)
+  if (${EXECUTORCH_ENABLE_LOGGING_RELEASE_MODE})
+    set(_ET_LOG_ENABLE 1)
+  endif()
+
   # Avoid pulling in the logging strings, which can be large.
-  add_definitions(-DET_LOG_ENABLED=0)
+  add_definitions(-DET_LOG_ENABLED=${_ET_LOG_ENABLE})
   # Avoid pulling in the flatbuffer data verification
   # logic, which can add about 20kB.
   add_definitions(-DET_ENABLE_PROGRAM_VERIFICATION=0)
@@ -106,6 +116,10 @@ if(BUILD_SELECTIVE_BUILD_TEST)
   option(SELECT_OPS_YAML "Register all the ops from a given yaml file" OFF)
 endif()
 
+# Build Arm Baremetal backend
+option(EXECUTORCH_BUILD_ARM_BAREMETAL
+  "Build the Arm Baremetal flow for Cortex-M and Ethos-U" OFF)
+
 # Build xnn_executor_runner which depends on XNNPACK
 option(EXECUTORCH_BUILD_XNNPACK
   "Build xnn_executor_runner which depends on XNNPACK" OFF)
@@ -295,6 +309,10 @@ if(EXECUTORCH_BUILD_XNNPACK)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack)
 endif()
 
+if(EXECUTORCH_BUILD_ARM_BAREMETAL)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm)
+endif()
+
 # Add selective build subdirectory
 if(BUILD_SELECTIVE_BUILD_TEST)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/examples/selective_build)
diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt
new file mode 100644
index 00000000000..2cc5cf94740
--- /dev/null
+++ b/backends/arm/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Copyright 2023 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+cmake_minimum_required(VERSION 3.19)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+endif()
+
+include(${EXECUTORCH_ROOT}/build/Utils.cmake)
+
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
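+# The parent of EXECUTORCH_ROOT is used above so that ExecuTorch headers
+# resolve as <executorch/...> includes.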
+set(_common_compile_options -Wno-deprecated-declarations)
+
+include(cmake/Dependencies.cmake)
+
+set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp)
+list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/")
+add_library(ethos_u STATIC ${_arm_baremetal_sources})
+target_include_directories(ethos_u PUBLIC ${_common_include_directories})
+target_include_directories(ethos_u PUBLIC ${DRIVER_ETHOSU_INCLUDE_DIR})
diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
index 1a6499cf07d..82b24f4b9b6 100644
--- a/backends/arm/arm_backend.py
+++ b/backends/arm/arm_backend.py
@@ -13,6 +13,7 @@
 import operator
 import os
 import tempfile
+import subprocess
 
 from typing import final, List
 
 import numpy as np
@@ -140,6 +141,64 @@ def dbg_tosa_dump(tosa_fb, path):
     f.write(js)
     f.close()
 
+# Output to Vela with current file-based compilation
+# WARNING: if this changes, the runtime reader also needs to change
+def vela_compile(tosa_fb):
+    with tempfile.TemporaryDirectory() as tmpdir:
+        print(f"compiling to Vela in {tmpdir}")
+
+        tosaname = "out.tosa"
+        flatbuffer = tosa_fb.serialize()
+        f = open(os.path.join(tmpdir,tosaname), "wb")
+        f.write(flatbuffer)
+        f.close()
+
+        # invoke vela
+        # TODO target ethos-u55-128
+        vela_command = f"cd {tmpdir}; vela --accelerator-config ethos-u55-128 {tosaname}"
+        subprocess.run([vela_command], shell=True, check=True)
+
+        np_path = os.path.join(tmpdir,"output","out_sg0_vela.npz")
+        blocks = b''
+        with np.load(np_path, allow_pickle=False) as data:
+            # Emit the NPZ regions as:
+            #  - 16 byte block name null terminated string (padded to 16 if name shorter)
+            #  - 4 bytes of int32 block length and 12 bytes of 0's
+            #  - block data (padded to 16 byte alignment at end)
+            # Repeat for all blocks
+            for key in data.keys():
+                block_name = bytes(key,"utf8")[:15]
+                block_name = block_name + b'\x00'*(16-len(block_name))
+                block_data = data[key].tobytes()
+                # We need the actual unpadded block lengths for hw setup
+                block_length = len(block_data).to_bytes(16, 'little')
+                # pad block data to multiple of 16 bytes
+                block_data = block_data + b'\x00'*(15-(len(block_data)-1)%16)
+
+                block = block_name + block_length + block_data
+                blocks = blocks + block
+
+            # Add a block for scratch, inputs and outputs
+            # scratch shape is a 1 element array giving us size in bytes
+            block_name = bytes("scratch_data","utf8")[:15]
+            block_name = block_name + b'\x00'*(16-len(block_name))
+            block_length = data["scratch_shape"][0].item()
+            print(f"scratch length = {block_length}")
+            block_length = block_length+(15-(block_length-1)%16)
+            block_data = b'\x00'*block_length
+            block_length = block_length.to_bytes(16, 'little')
+            print(f"lengths {len(block_name)} {len(block_length)} {len(block_data)}")
+            block = block_name + block_length + block_data
+            blocks = blocks + block
+            # TODO are these already in scratch shape? They look to be:
+            #input_shape * input_elem_size
+            #output_shape * output_elem_size
+            # input_offset and output_offset specify the location these arrays are written from base of scratch
+
+        # return 16 byte VELA bin header + blocks + footer
+        header = bytes("vela_bin_stream","utf-8") + b'\x00'
+        footer = bytes("vela_end_stream","utf-8") + b'\x00'
+        return header + blocks + footer
 
 def dbg_fail(node, tosa_fb, path):
     dbg_tosa_dump(tosa_fb, path)
@@ -205,10 +264,6 @@ def preprocess(  # noqa: C901
             path = spec.value.decode()
             debug_output = True
 
-    # in non debug builds we still pass files to vela
-    if path is None:
-        path = tempfile.mkdtemp(prefix="arm_tosa_")
-
     # Converted output for this subgraph, serializer needs path early as it emits
     # const data directly. Path created and data written only in debug builds.
     tosa_fb = ts.TosaSerializer(path)
@@ -680,5 +735,7 @@ def preprocess(  # noqa: C901
         dbg_tosa_dump(tosa_fb, path)
 
     # Serialize and return the tosa flatbuffer
-    fb = tosa_fb.serialize()
-    return PreprocessResult(processed_bytes=bytes(fb))
+    # fb = bytes(tosa_fb.serialize())
+    binary = vela_compile(tosa_fb)
+
+    return PreprocessResult(processed_bytes=binary)
diff --git a/backends/arm/cmake/Dependencies.cmake b/backends/arm/cmake/Dependencies.cmake
new file mode 100644
index 00000000000..27a587176bb
--- /dev/null
+++ b/backends/arm/cmake/Dependencies.cmake
@@ -0,0 +1,12 @@
+# Copyright 2023 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
+
+# Ethos-U driver
+set(DRIVER_ETHOSU_SOURCE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver")
+set(DRIVER_ETHOSU_INCLUDE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include")
+add_subdirectory( ${DRIVER_ETHOSU_SOURCE_DIR} )
+include_directories( ${DRIVER_ETHOSU_INCLUDE_DIR} )
diff --git a/backends/arm/cmake/arm-none-eabi-gcc.cmake b/backends/arm/cmake/arm-none-eabi-gcc.cmake
new file mode 100644
index 00000000000..d70f79361cd
--- /dev/null
+++ b/backends/arm/cmake/arm-none-eabi-gcc.cmake
@@ -0,0 +1,90 @@
+# Copyright 2023 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
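+#
+# Toolchain file for bare-metal Cortex-M cross compilation; passed to CMake
+# via --toolchain as in backends/arm/cmake/build.sh. TARGET_CPU is a cache
+# variable and can be overridden on the command line.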
+
+set(TARGET_CPU "cortex-m4" CACHE STRING "Target CPU")
+string(TOLOWER ${TARGET_CPU} CMAKE_SYSTEM_PROCESSOR)
+
+set(CMAKE_SYSTEM_NAME Generic)
+set(CMAKE_C_COMPILER "arm-none-eabi-gcc")
+set(CMAKE_CXX_COMPILER "arm-none-eabi-g++")
+set(CMAKE_ASM_COMPILER "arm-none-eabi-gcc")
+set(CMAKE_LINKER "arm-none-eabi-ld")
+
+set(CMAKE_EXECUTABLE_SUFFIX ".elf")
+set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+# Select C/C++ version
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_CXX_STANDARD 14)
+
+set(GCC_CPU ${CMAKE_SYSTEM_PROCESSOR})
+string(REPLACE "cortex-m85" "cortex-m55" GCC_CPU ${GCC_CPU})
+
+# Compile options
+add_compile_options(
+    -mcpu=${GCC_CPU}
+    -mthumb
+    "$<$<CONFIG:DEBUG>:-gdwarf-3>"
+    "$<$<COMPILE_LANGUAGE:CXX>:-fno-unwind-tables;-fno-rtti;-fno-exceptions>"
+    -fdata-sections
+    -ffunction-sections)
+
+# Compile defines
+add_compile_definitions(
+    "$<$<NOT:$<CONFIG:DEBUG>>:NDEBUG>")
+
+# Link options
+add_link_options(
+    -mcpu=${GCC_CPU}
+    -mthumb
+    --specs=nosys.specs)
+
+# Set floating point unit
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+fp")
+    set(FLOAT hard)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+nofp")
+    set(FLOAT soft)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m33(\\+|$)" OR
+       CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m55(\\+|$)" OR
+       CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m85(\\+|$)")
+    set(FLOAT hard)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m4(\\+|$)" OR
+       CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m7(\\+|$)")
+    set(FLOAT hard)
+    set(FPU_CONFIG "fpv4-sp-d16")
+    add_compile_options(-mfpu=${FPU_CONFIG})
+    add_link_options(-mfpu=${FPU_CONFIG})
+else()
+    set(FLOAT soft)
+endif()
+
+if (FLOAT)
+    add_compile_options(-mfloat-abi=${FLOAT})
+    add_link_options(-mfloat-abi=${FLOAT})
+endif()
+
+add_link_options(LINKER:--nmagic,--gc-sections)
+
+# Compilation warnings
+add_compile_options(
+#    -Wall
+#    -Wextra
+
+#    -Wcast-align
+#    -Wdouble-promotion
+#    -Wformat
+#    -Wmissing-field-initializers
+#    -Wnull-dereference
+#    -Wredundant-decls
+#    -Wshadow
+#    -Wswitch
+#    -Wswitch-default
+#    -Wunused
+    -Wno-redundant-decls
+    -Wno-psabi
+)
diff --git a/backends/arm/cmake/build.sh b/backends/arm/cmake/build.sh
new file mode 100755
index 00000000000..0dbb8cf2177
--- /dev/null
+++ b/backends/arm/cmake/build.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# Copyright 2023 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+set -e
+
+#
+# Setup toolchain
+#
+BASEDIR=`realpath $(dirname "$0")`
+echo "building using build.sh in $BASEDIR"
+
+ARCH=$(uname -i)
+GCCPATH=${BASEDIR}/arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi/bin/
+
+echo $GCCPATH
+if test -d "${GCCPATH}"; then
+    echo Using existing compiler ${GCCPATH}
+else
+    pushd ${BASEDIR}/
+    ./toolchain.sh
+    popd
+fi
+export PATH=${PATH}:${GCCPATH}
+
+echo building with `arm-none-eabi-gcc -v 2>&1 | grep "^gcc"`
+
+
+#
+# Prepare and run clean build
+#
+rm -rf buck-out/ build/lib/ cmake-out/
+rm -rf cmake-corstone
+mkdir cmake-corstone
+cd cmake-corstone
+
+#cmake -DBUCK2=buck2 ..
+
+#cmake --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake ..
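+
+# Configure the bare-metal cross build. This assumes a host-built flatc is
+# already on the PATH, since the cross-compiled tree cannot run its own tools.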
+cmake -DFLATC_EXECUTABLE=flatc \
+    -DEXECUTORCH_BUILD_XNNPACK=OFF \
+    -DEXECUTORCH_BUILD_HOST_TARGETS=OFF \
+    -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \
+    -DCMAKE_SYSTEM_PROCESSOR=cortex-m55+nodsp+nofp \
+    -DETHOSU_TARGET_NPU_CONFIG=ethos-u55-128 \
+    --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DEXECUTORCH_ENABLE_LOGGING_RELEASE_MODE=ON \
+    ..
+
+cd ..
+cmake --build cmake-corstone -j9 --target ethos_u ethosu_core_driver executorch portable_ops_lib portable_kernels
diff --git a/backends/arm/cmake/toolchain.sh b/backends/arm/cmake/toolchain.sh
new file mode 100755
index 00000000000..92188ee982d
--- /dev/null
+++ b/backends/arm/cmake/toolchain.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Copyright 2023 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+set -e
+
+# Cross compiler for Arm baremetal (e.g. Corstone-300 FVP or silicon)
+ARCH=$(uname -i)
+curl -o gcc.tar.xz https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi.tar.xz
+tar xf gcc.tar.xz
+export PATH=${PATH}:`(cd arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi/bin/; pwd)`
diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp
new file mode 100644
index 00000000000..3dc52645089
--- /dev/null
+++ b/backends/arm/runtime/ArmBackendEthosU.cpp
@@ -0,0 +1,261 @@
+/*
+ * Copyright 2023 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/*
+ * Arm backend for the Ethos-U baremetal driver stack; it relies on the
+ * ethos-u-core-driver for hardware interaction.
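+ *
+ * The 'processed' buffer it consumes is the vela_bin_stream produced by
+ * vela_compile() in backends/arm/arm_backend.py: a 16 byte header string
+ * "vela_bin_stream\0", a sequence of blocks (16 byte name, 16 byte
+ * little-endian length of which only the low 4 bytes are significant, then
+ * data padded to a multiple of 16 bytes) and a 16 byte footer string
+ * "vela_end_stream\0". vela_read() below walks this layout.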
+ */
+
+#include <cstring>
+
+#include <executorch/runtime/backend/interface.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+
+#include <ethosu_driver.h>
+#include <pmu_ethosu.h>
+
+namespace torch {
+namespace executor {
+
+// TODO we should be in 0x31, not this lower 1MB sRAM
+// SRAM (rwx) : ORIGIN = 0x31000000, LENGTH = 0x00200000
+#define CS300_SRAM_LOW ((void*)0x11000000)
+#define CS300_SRAM_HIGH ((void*)0x110FFFFF)
+
+class ArmBackend final : public PyTorchBackendInterface {
+
+public:
+  ArmBackend() {
+    ET_LOG(Debug, "Constructing ARM Backend");
+  }
+
+  ~ArmBackend() = default;
+
+  virtual bool is_available() const override {
+    return true;
+  }
+
+  Result<DelegateHandle*> init(
+      BackendInitContext& context,
+      FreeableBuffer* processed,
+      ArrayRef<CompileSpec> compile_specs) const override {
+
+    ET_LOG(Info, "ArmBackend::init %p", processed->data() );
+
+    char *data = (char*)processed->data();
+    size_t size = processed->size();
+    char *foot = data + size - 16;
+
+    // Header and footer both 16 byte aligned suggest valid structure and we
+    // won't walk off the end of the chunks and segfault
+    if( !((int)data == next_mul_16((int)data)) )
+    {
+      ET_LOG(Error, "ArmBackend::init header unaligned");
+      return Error::InvalidProgram;
+    }
+    if( !((int)foot == next_mul_16((int)foot)) )
+    {
+      ET_LOG(Error, "ArmBackend::init footer unaligned");
+      return Error::InvalidProgram;
+    }
+    if( !(0 == strncmp( data, "vela_bin_stream", 15 )) )
+    {
+      ET_LOG(Error, "ArmBackend::init header magic invalid");
+      return Error::InvalidProgram;
+    }
+    if( !(0 == strncmp( foot, "vela_end_stream", 15 )) )
+    {
+      ET_LOG(Error, "ArmBackend::init footer magic invalid");
+      return Error::InvalidProgram;
+    }
+    // Verify address range is accessible current expectation is the program
+    // is wholly stored in SRAM
+    if( !(data > CS300_SRAM_LOW && foot < CS300_SRAM_HIGH) )
+    {
+      ET_LOG(Error, "ArmBackend::init program not wholly in SRAM");
+      return Error::InvalidProgram;
+    }
+
+    // Return the same buffer we were passed - this data will be
+    // executed directly
+    return processed;
+  }
+
+  Error execute(
+      BackendExecutionContext& context,
+      DelegateHandle* input_handle,
+      EValue** args) const override {
+
+    FreeableBuffer* processed = (FreeableBuffer*)input_handle;
+
+    ET_LOG(Info, "ArmBackend::execute %p", processed->data() );
+
+    vela_handles handles = { 0, 0, 0, 0, 0, 0 };
+
+    // Command stream - we know at this point it's aligned
+    char *data = (char*)processed->data();
+
+    // Read key sections from the vela_bin_stream
+    if( !this->vela_read( data, &handles, processed->size() ) )
+    {
+      ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout" );
+      return Error::InvalidProgram;
+    }
+
+    ET_LOG(Debug, "ArmBackend::execute: Running program data:\n  cmd %p %d\n  weight %p %d\n  scratch %p %d\n",
+        handles.cmd_data, handles.cmd_data_size,
+        handles.weight_data, handles.weight_data_size,
+        handles.scratch_data, handles.scratch_data_size );
+
+    // TMP emit scratch
+    printf("Scratch before:\n");
+    for( int i=0; i<handles.scratch_data_size; i++ )
+    {
+      printf("%02x ", ((unsigned char*)handles.scratch_data)[i] );
+    }
+    printf("\n");
+
+    // Write the input into the scratch area at the offset Vela assigned for
+    // the network input; assumes a single input at args[0] and a single
+    // output at args[1], as in the add.pte example
+    char *input_address = (char*)handles.scratch_data + handles.input_offset;
+    auto tensor_in = args[0]->toTensor();
+    for(int j=0; j<tensor_in.nbytes(); j++)
+    {
+      input_address[j] = tensor_in.mutable_data_ptr<char>()[j];
+    }
+
+    // Synchronously invoke the NPU: base address 0 is the constant weight
+    // stream, base address 1 is the scratch area holding activations,
+    // input and output
+    uint64_t bases[2] = { (uint64_t)handles.weight_data, (uint64_t)handles.scratch_data };
+    size_t bases_size[2] = { handles.weight_data_size, handles.scratch_data_size };
+    struct ethosu_driver *drv = ethosu_reserve_driver();
+    int result = ethosu_invoke_v3(
+        drv,
+        (void*)handles.cmd_data, handles.cmd_data_size,
+        bases, bases_size, 2,
+        nullptr );
+    ethosu_release_driver(drv);
+    if( result != 0 )
+    {
+      ET_LOG(Error, "ArmBackend::execute: Ethos-U invocation failed, error %d", result );
+      return Error::InvalidProgram;
+    }
+
+    // Copy the network output from scratch back into the output EValue
+    char *output_address = (char*)handles.scratch_data + handles.output_offset;
+    auto tensor_out = args[1]->toTensor();
+    for(int j=0; j<tensor_out.nbytes(); j++)
+    {
+      tensor_out.mutable_data_ptr<char>()[j] = output_address[j];
+    }
+
+    return Error::Ok;
+  }
+
+  void destroy(DelegateHandle* handle) const override {
+    return;
+  }
+
+private:
+  typedef struct {
+    const char *cmd_data; size_t cmd_data_size;
+    const char *weight_data; size_t weight_data_size;
+    const char *scratch_data; size_t scratch_data_size;
+    size_t input_offset; size_t input_data_shape[3];
+    size_t output_offset; size_t output_data_shape[3];
+  } vela_handles;
+
+  typedef struct {
+    char name[16];
+    int size; char _pad[12];
+    char data[];
+  } vela_bin_block;
+
+  static int next_mul_16( int n ) {
+    return ((n-1)|15)+1;
+  }
+
+  int vela_read(char* data, vela_handles *h, int size ) const {
+
+    // Read header string
+    if( strncmp( data, "vela_bin_stream", 15 ) )
+    {
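+      // Missing the vela_bin_stream magic - not a valid vela binary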
+      return 0;
+    }
+    data += 16;
+
+    // Expect one or more 'vela_bin_block's
+    while( 1 )
+    {
+      vela_bin_block *b = (vela_bin_block*)data;
+      data += 16 + 16 + next_mul_16(b->size);
+
+      // Exit with success on finding end of stream
+      if( !strncmp( b->name, "vela_end_stream", 15 ) ) return 1;
+
+      if( !strncmp( b->name, "cmd_data", strlen("cmd_data")) )
+      {
+        // This magic header confirms a valid command stream in binary
+        if( strncmp( b->data, "COP1", 4 ) ) return 0;
+        h->cmd_data = b->data;
+        h->cmd_data_size = b->size;
+      }
+      if( !strncmp( b->name, "weight_data", strlen("weight_data")) )
+      {
+        h->weight_data = b->data;
+        h->weight_data_size = b->size;
+      }
+      if( !strncmp( b->name, "scratch_data", strlen("scratch_data")) )
+      {
+        h->scratch_data = b->data;
+        h->scratch_data_size = b->size;
+      }
+
+      // capture inputs and outputs
+      if( !strncmp( b->name, "input_offset", strlen("input_offset")) )
+      {
+        h->input_offset = ((int*)b->data)[0];
+      }
+      if( !strncmp( b->name, "output_offset", strlen("output_offset")) )
+      {
+        h->output_offset = ((int*)b->data)[0];
+      }
+      if( !strncmp( b->name, "input_shape", strlen("input_shape")) )
+      {
+        h->input_data_shape[0] = ((int*)b->data)[0];
+        h->input_data_shape[1] = ((int*)b->data)[1];
+        h->input_data_shape[2] = ((int*)b->data)[2];
+      }
+      if( !strncmp( b->name, "output_shape", strlen("output_shape")) )
+      {
+        h->output_data_shape[0] = ((int*)b->data)[0];
+        h->output_data_shape[1] = ((int*)b->data)[1];
+        h->output_data_shape[2] = ((int*)b->data)[2];
+      }
+    }
+  }
+
+};
+
+auto backend = ArmBackend();
+void arm_backend_register() {
+  Backend backend_id{"ArmBackend", &backend};
+  static auto registered = register_backend(backend_id);
+}
+
+} // namespace executor
+} // namespace torch
diff --git a/backends/arm/third-party/cmsis b/backends/arm/third-party/cmsis
new file mode 160000
index 00000000000..a75f01746df
--- /dev/null
+++ b/backends/arm/third-party/cmsis
@@ -0,0 +1 @@
+Subproject commit a75f01746df18bb5b929dfb8dc6c9407fac3a0f3
diff --git a/backends/arm/third-party/ethos-u-core-driver b/backends/arm/third-party/ethos-u-core-driver
new file mode 160000
index 00000000000..90f9df900ac
--- /dev/null
+++ b/backends/arm/third-party/ethos-u-core-driver
@@ -0,0 +1 @@
+Subproject commit 90f9df900acdc0718ecd2dfdc53780664758dec5
diff --git a/codegen/templates/RegisterCodegenUnboxedKernels.cpp b/codegen/templates/RegisterCodegenUnboxedKernels.cpp
index a7790be7fed..86938d065b8 100644
--- a/codegen/templates/RegisterCodegenUnboxedKernels.cpp
+++ b/codegen/templates/RegisterCodegenUnboxedKernels.cpp
@@ -11,6 +11,8 @@
 #include
 #include
 #include
 #include "${fn_header}" // Generated Function import headers
+#include
+
 // ${generated_comment}
 
 // NOTE [Sharded File]: This file is generated in a sharded fashion to speed up
@@ -24,8 +26,6 @@ using KernelArrayRef = ::torch::executor::ArrayRef<::torch::executor::Kernel>;
 namespace torch {
 namespace executor {
-namespace function {
-namespace {
 
 static Kernel kernels_to_register[] = {
     ${unboxed_kernels} // Generated kernels
@@ -39,8 +39,11 @@ static KernelArrayRef kernel_array_ref(
 // Return value not used. Keep the static variable assignment to register
 // kernels in static initialization time.
-static auto success_with_kernel_reg = register_kernels(kernel_array_ref); -} // namespace -} // namespace function +// static auto success_with_kernel_reg = register_kernels(kernel_array_ref); + +void manual_override() { + static auto success_with_kernel_reg = register_kernels(kernel_array_ref); +} + } // namespace executor } // namespace torch diff --git a/examples/arm/arm_tosa_e2e.py b/examples/arm/arm_tosa_e2e.py index e320ca0cf4e..a9e07bed4c9 100644 --- a/examples/arm/arm_tosa_e2e.py +++ b/examples/arm/arm_tosa_e2e.py @@ -153,7 +153,7 @@ def tosa_run_test(op, profile=TosaProfile.MI): # noqa: C901 # Temp systest mode for running all models against both inference profiles if __name__ == "__main__": for op in TestList: - tosa_run_test(op, profile=TosaProfile.MI) + tosa_run_test(op, profile=TosaProfile.BI) # TODO: haven't added the quantized lowerings for BI, comment out for now # for op in TestList: diff --git a/examples/arm/cs300/core_platform/patches/0001-HACK-regress-cmake-version-from-3.21-3.20.patch b/examples/arm/cs300/core_platform/patches/0001-HACK-regress-cmake-version-from-3.21-3.20.patch new file mode 100644 index 00000000000..efb02478229 --- /dev/null +++ b/examples/arm/cs300/core_platform/patches/0001-HACK-regress-cmake-version-from-3.21-3.20.patch @@ -0,0 +1,25 @@ +From a969839b90756b2458cb80ac5edb619e87210bea Mon Sep 17 00:00:00 2001 +From: Digant Desai +Date: Thu, 28 Sep 2023 18:05:03 -0700 +Subject: [PATCH 1/3] [HACK] regress cmake version from 3.21 --> 3.20 + +--- + targets/corstone-300/CMakeLists.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/targets/corstone-300/CMakeLists.txt b/targets/corstone-300/CMakeLists.txt +index 62205bb..7dda8a1 100644 +--- a/targets/corstone-300/CMakeLists.txt ++++ b/targets/corstone-300/CMakeLists.txt +@@ -42,7 +42,7 @@ set(MEMORY_ARENA "dram" CACHE STRING "Memory config for arena") + # Project + ############################################################################# + +-cmake_minimum_required(VERSION 3.21) ++cmake_minimum_required(VERSION 3.20) + + project(ethos-u-corstone-300 VERSION 0.0.1) + +-- +2.39.3 + diff --git a/examples/arm/cs300/core_platform/patches/0002-HACK-disable-warnings-to-reduce-verbosity.patch b/examples/arm/cs300/core_platform/patches/0002-HACK-disable-warnings-to-reduce-verbosity.patch new file mode 100644 index 00000000000..f2a6e17ccd8 --- /dev/null +++ b/examples/arm/cs300/core_platform/patches/0002-HACK-disable-warnings-to-reduce-verbosity.patch @@ -0,0 +1,52 @@ +From 3687c49c2ca85ca8a7d554b1206272870c565de3 Mon Sep 17 00:00:00 2001 +From: Digant Desai +Date: Thu, 28 Sep 2023 18:05:30 -0700 +Subject: [PATCH 2/3] [HACK] disable warnings to reduce verbosity + +--- + cmake/toolchain/arm-none-eabi-gcc.cmake | 28 ++++++++++++------------- + 1 file changed, 14 insertions(+), 14 deletions(-) + +diff --git a/cmake/toolchain/arm-none-eabi-gcc.cmake b/cmake/toolchain/arm-none-eabi-gcc.cmake +index 093005e..0e6a2ed 100644 +--- a/cmake/toolchain/arm-none-eabi-gcc.cmake ++++ b/cmake/toolchain/arm-none-eabi-gcc.cmake +@@ -85,21 +85,21 @@ add_link_options(LINKER:--nmagic,--gc-sections) + + # Compilation warnings + add_compile_options( +- -Wall +- -Wextra ++ # -Wall ++ # -Wextra + +- -Wcast-align +- -Wdouble-promotion +- -Wformat +- -Wmissing-field-initializers +- -Wnull-dereference +- -Wredundant-decls +- -Wshadow +- -Wswitch +- -Wswitch-default +- -Wunused ++ # -Wcast-align ++ # -Wdouble-promotion ++ # -Wformat ++ # -Wmissing-field-initializers ++ # -Wnull-dereference ++ # -Wredundant-decls ++ # 
-Wshadow ++ # -Wswitch ++ # -Wswitch-default ++ # -Wunused + +- -Wno-redundant-decls ++ # -Wno-redundant-decls + +- -Wno-psabi ++ # -Wno-psabi + ) +-- +2.39.3 + diff --git a/examples/arm/cs300/core_platform/patches/0003-HACK-Add-Executorch-add-example.patch b/examples/arm/cs300/core_platform/patches/0003-HACK-Add-Executorch-add-example.patch new file mode 100644 index 00000000000..9a0b0be554e --- /dev/null +++ b/examples/arm/cs300/core_platform/patches/0003-HACK-Add-Executorch-add-example.patch @@ -0,0 +1,224 @@ +From b5369c873814d765276a746ce26d2be5724da8f8 Mon Sep 17 00:00:00 2001 +From: Digant Desai +Date: Thu, 28 Sep 2023 19:07:51 -0700 +Subject: [PATCH 3/3] [HACK] Add Executorch add example + +--- + applications/CMakeLists.txt | 2 + + applications/executorch_tests/CMakeLists.txt | 53 ++++++++ + applications/executorch_tests/add.cpp | 130 +++++++++++++++++++ + 3 files changed, 185 insertions(+) + create mode 100644 applications/executorch_tests/CMakeLists.txt + create mode 100644 applications/executorch_tests/add.cpp + +diff --git a/applications/CMakeLists.txt b/applications/CMakeLists.txt +index 1fa2b2e..68e5427 100644 +--- a/applications/CMakeLists.txt ++++ b/applications/CMakeLists.txt +@@ -28,6 +28,8 @@ add_subdirectory(threadx_demo) + + add_subdirectory(message_handler_openamp) + ++add_subdirectory(executorch_tests) ++ + if (CMAKE_CXX_COMPILER_ID STREQUAL "ARMClang") + # Only armclang supported for now + add_subdirectory(trustzone_inference) +diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt +new file mode 100644 +index 0000000..8a34c44 +--- /dev/null ++++ b/applications/executorch_tests/CMakeLists.txt +@@ -0,0 +1,53 @@ ++# ++# Copyright (c) 2021 Arm Limited. All rights reserved. ++# ++# SPDX-License-Identifier: Apache-2.0 ++# ++# Licensed under the Apache License, Version 2.0 (the License); you may ++# not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an AS IS BASIS, WITHOUT ++# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++# ++ ++if (NOT TARGET ethosu_core_driver) ++ return() ++endif() ++ ++#### ++#### Executorch demo app/test ++#### ++ ++set(ET_DIR_PATH "<..>/executorch" CACHE PATH "Path to Executorch dir") ++set(ET_BUILD_DIR_PATH "${ET_DIR_PATH}/cmake-out" CACHE PATH "Path to Executorch build dir") ++set(ET_INCLUDE_PATH "${ET_DIR_PATH}/.." 
CACHE PATH "Path to Executorch headers") ++ ++get_filename_component(ET_BUILD_DIR_PATH ${ET_BUILD_DIR_PATH} REALPATH) ++get_filename_component(ET_DIR_PATH ${ET_DIR_PATH} REALPATH) ++get_filename_component(ET_INCLUDE_PATH ${ET_INCLUDE_PATH} REALPATH) ++ ++message("**********************") ++message("Executorch dir (ET_DIR_PATH) : ${ET_DIR_PATH}") ++message("Executorch build dir(ET_BUILD_DIR_PATH): ${ET_BUILD_DIR_PATH}") ++message("Executorch headers (ET_INCUDE_PATH) : ${ET_INCLUDE_PATH}") ++message("**********************") ++ ++set(LIB_ET_RUNTIME "${ET_BUILD_DIR_PATH}/libexecutorch.a") ++set(LIB_ET_OP_REGISTRATION "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_ops_lib.a") ++set(LIB_ET_OP_KERNELS "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_kernels.a") ++ ++ethosu_add_executable_test(executorch_add PRIVATE ++ SOURCES add.cpp ++ LIBRARIES ${LIB_ET_RUNTIME} ${LIB_ET_OP_REGISTRATION} ++ ${LIB_ET_OP_KERNELS}) ++ ++target_include_directories(executorch_add PRIVATE ++${ET_INCLUDE_PATH}) ++ ++# TODO Memory setup +diff --git a/applications/executorch_tests/add.cpp b/applications/executorch_tests/add.cpp +new file mode 100644 +index 0000000..115af66 +--- /dev/null ++++ b/applications/executorch_tests/add.cpp +@@ -0,0 +1,130 @@ ++/* ++ * SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates ++ * ++ * SPDX-License-Identifier: Apache-2.0 ++ * ++ * Licensed under the Apache License, Version 2.0 (the License); you may ++ * not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an AS IS BASIS, WITHOUT ++ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++/**************************************************************************** ++ * Includes ++ ****************************************************************************/ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++ ++#include ++#include ++ ++using namespace std; ++ ++__attribute__((section(".sram.data"), aligned(16))) uint8_t method_allocator_pool[4 * 1024U]; ++ ++/**************************************************************************** ++ * Functions ++ ****************************************************************************/ ++ ++int main() { ++ /* ++ * This is a simple Executorch app which runs `add.pte`. ++ */ ++ ++ torch::executor::runtime_init(); ++ ++ torch::executor::manual_override(); // Hack: This will be updated soon. 
++
++  using torch::executor::Result;
++  using torch::executor::Error;
++
++  auto loader = torch::executor::util::BufferDataLoader(add_pte, sizeof(add_pte));
++
++  Result<torch::executor::Program> program = torch::executor::Program::load(&loader);
++  if(!program.ok()) {
++    ET_LOG(Info,"ET: Program loading failed @ 0x%p: 0x%" PRIx32, add_pte, program.error());
++  }
++
++  ET_LOG(Info,"ET: Model buffer loaded, has %lu methods", program->num_methods());
++
++  const char* method_name = nullptr;
++  {
++    const auto method_name_result = program->get_method_name(0);
++    ET_CHECK_MSG(method_name_result.ok(), "Program has no methods");
++    method_name = *method_name_result;
++  }
++  ET_LOG(Info,"ET: Running method %s", method_name);
++
++  Result<torch::executor::MethodMeta> method_meta = program->method_meta(method_name);
++  if (!method_meta.ok()) {
++    ET_LOG(Info,"ET: Failed to get method_meta for %s: 0x%x",
++           method_name, (unsigned int)method_meta.error());
++  }
++
++  torch::executor::MemoryAllocator method_allocator{
++      torch::executor::MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)};
++
++  std::vector<std::unique_ptr<uint8_t[]>> planned_buffers; // Owns the memory
++  std::vector<torch::executor::Span<uint8_t>> planned_spans; // Passed to the allocator
++  size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers();
++
++  for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
++    size_t buffer_size = static_cast<size_t>(method_meta->memory_planned_buffer_size(id).get());
++    ET_LOG(Info,"ET: Setting up planned buffer %zu, size %zu.", id, buffer_size);
++
++    planned_buffers.push_back(std::make_unique<uint8_t[]>(buffer_size));
++    planned_spans.push_back({planned_buffers.back().get(), buffer_size});
++  }
++
++  torch::executor::HierarchicalAllocator planned_memory(
++      {planned_spans.data(), planned_spans.size()});
++
++  torch::executor::MemoryManager memory_manager(&method_allocator, &planned_memory);
++
++  Result<torch::executor::Method> method = program->load_method(method_name, &memory_manager);
++  if(!method.ok()) {
++    ET_LOG(Info,"ET: Loading of method %s failed with status 0x%" PRIx32, method_name, method.error());
++  }
++  ET_LOG(Info,"ET: Method loaded.");
++
++  ET_LOG(Info,"ET: Preparing inputs...");
++  auto inputs = torch::executor::util::PrepareInputTensors(*method);
++  ET_LOG(Info,"ET: Input prepared.");
++
++  ET_LOG(Info,"ET: Starting the model execution...");
++  Error status = method->execute();
++  if(status != Error::Ok){
++    ET_LOG(Info,"ET: Execution of method %s failed with status 0x%" PRIx32, method_name, status);
++  } else {
++    ET_LOG(Info,"ET: Model executed successfully.");
++  }
++
++  // Print the outputs.
++  std::vector<torch::executor::EValue> outputs(method->outputs_size());
++  ET_LOG(Info, "%zu outputs: ", outputs.size());
++  status = method->get_outputs(outputs.data(), outputs.size());
++  ET_CHECK(status == Error::Ok);
++  for (int i = 0; i < outputs.size(); ++i) {
++    for (int j = 0; j < outputs[i].toTensor().numel(); ++j) {
++      printf("Output[%d][%d]: %f\n", i, j, outputs[i].toTensor().const_data_ptr<float>()[j]);
++    }
++  }
++  return 0;
++}
+--
+2.39.3
+
diff --git a/examples/arm/cs300/setup.sh b/examples/arm/cs300/setup.sh
new file mode 100755
index 00000000000..63fbd36b3bc
--- /dev/null
+++ b/examples/arm/cs300/setup.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+set -eu
+
+ethos_u_dir=${1:-/tmp/ethos-u}
+script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+
+function patch_repo() {
+    echo -e "\nPreparing ${name}..."
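+    # Re-bases and patches ${ethos_u_dir}/${name}; ${name} and ${base_rev}
+    # are globals set by the callers below.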
+ cd ${ethos_u_dir}/${name} + + git reset --hard ${base_rev} + + patch_dir=${script_dir}/${name}/patches/ + [[ -e ${patch_dir} && $(ls -A ${patch_dir}) ]] && \ + git am -3 ${patch_dir}/*.patch + + echo -e "Patched ${name} @ $(git describe --all --long 2> /dev/null) in ${ethos_u_dir}/${name} dir.\n" +} + +name="core_platform" +base_rev=204210b1074071532627da9dc69950d058a809f4 +patch_repo + +name="core_software" +base_rev=74c514a5b50a19197a64a86095bc0429188adcbe +patch_repo + +exit $? diff --git a/examples/export/export_example.py b/examples/export/export_example.py index 9c2a9d9362e..e26d929aeac 100644 --- a/examples/export/export_example.py +++ b/examples/export/export_example.py @@ -12,6 +12,7 @@ from ..models import MODEL_NAME_TO_MODEL from ..models.model_factory import EagerModelFactory from .utils import export_to_exec_prog, save_pte_program +from executorch.exir.print_program import pretty_print, print_program # noqa FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" @@ -40,4 +41,6 @@ ) prog = export_to_exec_prog(model, example_inputs) + + pretty_print(prog.program.execution_plan) save_pte_program(prog.buffer, args.model_name) diff --git a/examples/models/toy_model/model.py b/examples/models/toy_model/model.py index 0f7131fe21c..1c8f9f3b590 100644 --- a/examples/models/toy_model/model.py +++ b/examples/models/toy_model/model.py @@ -45,9 +45,6 @@ def __init__(self): def forward(self, x, y): z = x + y - z = z + x - z = z + x - z = z + z return z def get_eager_model(self) -> torch.nn.Module: diff --git a/headrify.py b/headrify.py new file mode 100644 index 00000000000..cdae780c31c --- /dev/null +++ b/headrify.py @@ -0,0 +1,26 @@ +import binascii +bytes_per_line = 32 +hex_digits_per_line = bytes_per_line * 2 + +# copied from +# https://git.mlplatform.org/ml/ethos-u/ml-embedded-evaluation-kit.git/tree/scripts/py/gen_model_cpp.py + +magic_attr = '__attribute__((section(".sram.data"), aligned(16))) char' +# magic_attr = '__attribute__((section("network_model_sec"), aligned(16))) char' +# magic_attr = '__attribute__((section("input_data_sec"), aligned(16))) char' +filename="./add.pte" +with open(filename, "rb") as fr, open(f"{filename}.h", "w") as fw: + data = fr.read() + hexstream = binascii.hexlify(data).decode('utf-8') + + hexstring = magic_attr + ' add_pte[] = {' + + for i in range(0, len(hexstream), 2): + if 0 == (i % hex_digits_per_line): + hexstring += "\n" + hexstring += '0x' + hexstream[i:i+2] + ", " + + hexstring += '};\n' + fw.write(hexstring) + print(f"Wrote {len(hexstring)} bytes, original {len(data)}") + diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp index 1da9d0eaee5..1ec18b3775d 100644 --- a/kernels/portable/cpu/op_add.cpp +++ b/kernels/portable/cpu/op_add.cpp @@ -33,10 +33,15 @@ Tensor& add_out( ET_CHECK(canCast(common_type, out_type)); - ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "add", CTYPE_A, [&]() { - ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "add", CTYPE_B, [&]() { - ET_SWITCH_REAL_TYPES_AND(Bool, common_type, ctx, "add", CTYPE_IN, [&]() { - ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "add", CTYPE_OUT, [&]() { +// ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "add", CTYPE_A, [&]() { +// ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "add", CTYPE_B, [&]() { +// ET_SWITCH_REAL_TYPES_AND(Bool, common_type, ctx, "add", CTYPE_IN, [&]() { +// ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "add", CTYPE_OUT, [&]() { + + using CTYPE_A = float; + using CTYPE_B = float; + using CTYPE_IN = float; + using CTYPE_OUT = float; 
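+
+  // Types are hard-wired to float here, presumably to keep the full dtype
+  // dispatch tables out of this bare-metal bring-up build; the example
+  // add.pte only uses float tensors.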
CTYPE_IN alpha_val; ET_EXTRACT_SCALAR(alpha, alpha_val); @@ -51,10 +56,10 @@ Tensor& add_out( a, b, out); - }); - }); - }); - }); +// }); +// }); +// }); +// }); return out; } diff --git a/kernels/portable/cpu/vec_ops.h b/kernels/portable/cpu/vec_ops.h index 0373196a4b6..5a297026050 100644 --- a/kernels/portable/cpu/vec_ops.h +++ b/kernels/portable/cpu/vec_ops.h @@ -13,6 +13,7 @@ #include #include #include +#include /** * @file diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 949b771b9cc..6e31dbe4939 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -17,702 +17,7 @@ # See the README.md file in this directory for a description of the syntax used # by this file. -- op: _log_softmax.out - kernels: - - arg_meta: null - kernel_name: torch::executor::log_softmax_out - -- op: _native_batch_norm_legit_no_training.out - kernels: - - arg_meta: null - kernel_name: torch::executor::_native_batch_norm_legit_no_training_out - -- op: _softmax.out - kernels: - - arg_meta: null - kernel_name: torch::executor::softmax_out - -- op: _to_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::to_copy_out - -- op: abs.out - kernels: - - arg_meta: null - kernel_name: torch::executor::abs_out - -- op: acos.out - kernels: - - arg_meta: null - kernel_name: torch::executor::acos_out - -- op: acosh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::acosh_out - - op: add.out kernels: - arg_meta: null kernel_name: torch::executor::add_out - -- op: add.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::add_scalar_out - -- op: addmm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::addmm_out - -- op: alias_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::alias_copy_out - -- op: amax.out - kernels: - - arg_meta: null - kernel_name: torch::executor::amax_out - -- op: amin.out - kernels: - - arg_meta: null - kernel_name: torch::executor::amin_out - -- op: any.all_out - kernels: - - arg_meta: null - kernel_name: torch::executor::any_all_out - -- op: arange.out - kernels: - - arg_meta: null - kernel_name: torch::executor::arange_out - -- op: arange.start_out - kernels: - - arg_meta: null - kernel_name: torch::executor::arange_start_out - -- op: argmax.out - kernels: - - arg_meta: null - kernel_name: torch::executor::argmax_out - -- op: argmin.out - kernels: - - arg_meta: null - kernel_name: torch::executor::argmin_out - -- op: as_strided_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::as_strided_copy_out - -- op: asin.out - kernels: - - arg_meta: null - kernel_name: torch::executor::asin_out - -- op: asinh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::asinh_out - -- op: atan.out - kernels: - - arg_meta: null - kernel_name: torch::executor::atan_out - -- op: atanh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::atanh_out - -- op: avg_pool2d.out - kernels: - - arg_meta: null - kernel_name: torch::executor::avg_pool2d_out - -- op: bitwise_and.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::bitwise_and_Scalar_out - -- op: bitwise_and.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::bitwise_and_Tensor_out - -- op: bitwise_not.out - kernels: - - arg_meta: null - kernel_name: torch::executor::bitwise_not_out - -- op: bitwise_or.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::bitwise_or_Scalar_out - -- op: bitwise_or.Tensor_out - kernels: - - 
arg_meta: null - kernel_name: torch::executor::bitwise_or_Tensor_out - -- op: bitwise_xor.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::bitwise_xor_Scalar_out - -- op: bitwise_xor.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::bitwise_xor_Tensor_out - -- op: bmm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::bmm_out - -- op: cat.out - kernels: - - arg_meta: null - kernel_name: torch::executor::cat_out - -- op: ceil.out - kernels: - - arg_meta: null - kernel_name: torch::executor::ceil_out - -- op: clamp.out - cpp_no_default_args: ['min'] - kernels: - - arg_meta: null - kernel_name: torch::executor::clamp_out - -- op: clone.out - kernels: - - arg_meta: null - kernel_name: torch::executor::clone_out - -- op: constant_pad_nd.out - kernels: - - arg_meta: null - kernel_name: torch::executor::constant_pad_nd_out - -- op: convolution.out - kernels: - - arg_meta: null - kernel_name: torch::executor::convolution_out - -- op: copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::copy_out - -- op: cos.out - kernels: - - arg_meta: null - kernel_name: torch::executor::cos_out - -- op: cosh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::cosh_out - -- op: cumsum.out - kernels: - - arg_meta: null - kernel_name: torch::executor::cumsum_out - -- op: detach_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::detach_copy_out - -- op: div.out - kernels: - - arg_meta: null - kernel_name: torch::executor::div_out - -- op: div.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::div_scalar_out - -- op: div.out_mode - kernels: - - arg_meta: null - kernel_name: torch::executor::div_out_mode - - -- op: embedding.out - kernels: - - arg_meta: null - kernel_name: torch::executor::embedding_out - -- op: empty.out - kernels: - - arg_meta: null - kernel_name: torch::executor::empty_out - -- op: eq.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::eq_scalar_out - -- op: erf.out - kernels: - - arg_meta: null - kernel_name: torch::executor::erf_out - -- op: exp.out - kernels: - - arg_meta: null - kernel_name: torch::executor::exp_out - -- op: expand_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::expand_copy_out - -- op: fill.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::fill_scalar_out - -- op: fill.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::fill_tensor_out - -- op: floor.out - kernels: - - arg_meta: null - kernel_name: torch::executor::floor_out - -- op: floor_divide.out - kernels: - - arg_meta: null - kernel_name: torch::executor::floor_divide_out - -- op: fmod.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::fmod_Tensor_out - -- op: fmod.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::fmod_Scalar_out - -- op: full.out - kernels: - - arg_meta: null - kernel_name: torch::executor::full_out - -# TODO: Investigate why empty dispatch is required for building: -# buck2 build //executorch/kernels/portable:generated_lib -- op: full_like.out - dispatch: {} - kernels: - - arg_meta: null - kernel_name: torch::executor::full_like_out - -- op: ge.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::ge_scalar_out - -- op: ge.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::ge_tensor_out - -- op: gelu.out - kernels: - - arg_meta: null - kernel_name: torch::executor::gelu_out - -- op: glu.out - 
kernels: - - arg_meta: null - kernel_name: torch::executor::glu_out - -- op: gt.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::gt_scalar_out - -- op: gt.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::gt_tensor_out - -- op: hardtanh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::hardtanh_out - -- op: index.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::index_Tensor_out - -- op: index_put.out - kernels: - - arg_meta: null - kernel_name: torch::executor::index_put_out - -- op: index_select.out - kernels: - - arg_meta: null - kernel_name: torch::executor::index_select_out - -- op: isinf.out - kernels: - - arg_meta: null - kernel_name: torch::executor::isinf_out - -- op: isnan.out - kernels: - - arg_meta: null - kernel_name: torch::executor::isnan_out - -- op: le.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::le_scalar_out - -- op: le.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::le_tensor_out - -- op: leaky_relu.out - kernels: - - arg_meta: null - kernel_name: torch::executor::leaky_relu_out - -- op: lift_fresh_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::lift_fresh_copy_out - -- op: log.out - kernels: - - arg_meta: null - kernel_name: torch::executor::log_out - -- op: logical_and.out - kernels: - - arg_meta: null - kernel_name: torch::executor::logical_and_out - -- op: logical_not.out - kernels: - - arg_meta: null - kernel_name: torch::executor::logical_not_out - -- op: logical_or.out - kernels: - - arg_meta: null - kernel_name: torch::executor::logical_or_out - -- op: logical_xor.out - kernels: - - arg_meta: null - kernel_name: torch::executor::logical_xor_out - -- op: logit.out - kernels: - - arg_meta: null - kernel_name: torch::executor::logit_out - -- op: lt.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::lt_scalar_out - -- op: lt.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::lt_tensor_out - -- op: masked_fill.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::masked_fill_scalar_out - -- op: max.dim_max - kernels: - - arg_meta: null - kernel_name: torch::executor::max_out - -- op: max_pool2d_with_indices.out - kernels: - - arg_meta: null - kernel_name: torch::executor::max_pool2d_with_indices_out - -- op: mean.out - kernels: - - arg_meta: null - kernel_name: torch::executor::mean_dim_out - -- op: min.dim_min - kernels: - - arg_meta: null - kernel_name: torch::executor::min_out - -- op: minimum.out - kernels: - - arg_meta: null - kernel_name: torch::executor::minimum_out - -- op: mm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::mm_out - -- op: mul.out - kernels: - - arg_meta: null - kernel_name: torch::executor::mul_out - -- op: mul.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::mul_scalar_out - -- op: native_layer_norm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::native_layer_norm_out - -- op: ne.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::ne_scalar_out - -- op: ne.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::ne_tensor_out - -- op: neg.out - kernels: - - arg_meta: null - kernel_name: torch::executor::neg_out - -- op: nonzero.out - kernels: - - arg_meta: null - kernel_name: torch::executor::nonzero_out - -- op: ones.out - kernels: - - arg_meta: null - kernel_name: torch::executor::ones_out - -- op: permute_copy.out - 
kernels: - - arg_meta: null - kernel_name: torch::executor::permute_copy_out - -- op: pixel_shuffle.out - kernels: - - arg_meta: null - kernel_name: torch::executor::pixel_shuffle_out - -- op: pow.Tensor_Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::pow_Tensor_Scalar_out - -- op: pow.Tensor_Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::pow_Tensor_Tensor_out - -- op: reciprocal.out - kernels: - - arg_meta: null - kernel_name: torch::executor::reciprocal_out - -- op: relu.out - kernels: - - arg_meta: null - kernel_name: torch::executor::relu_out - -- op: remainder.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::remainder_Tensor_out - -- op: remainder.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::remainder_Scalar_out - -- op: repeat.out - kernels: - - arg_meta: null - kernel_name: torch::executor::repeat_out - -- op: round.out - kernels: - - arg_meta: null - kernel_name: torch::executor::round_out - -- op: rsqrt.out - kernels: - - arg_meta: null - kernel_name: torch::executor::rsqrt_out - -- op: rsub.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::rsub_scalar_out - -- op: scalar_tensor.out - kernels: - - arg_meta: null - kernel_name: torch::executor::scalar_tensor_out - -- op: scatter_add.out - kernels: - - arg_meta: null - kernel_name: torch::executor::scatter_add_out - -- op: select_copy.int_out - kernels: - - arg_meta: null - kernel_name: torch::executor::select_copy_int_out - -- op: select_scatter.out - kernels: - - arg_meta: null - kernel_name: torch::executor::select_scatter_out - -- op: sigmoid.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sigmoid_out - -- op: sign.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sign_out - -- op: sin.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sin_out - -- op: sinh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sinh_out - -- op: slice_copy.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::slice_copy_Tensor_out - -- op: slice_scatter.out - kernels: - - arg_meta: null - kernel_name: torch::executor::slice_scatter_out - -- op: split_copy.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::split_copy_Tensor_out - -- op: sqrt.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sqrt_out - -- op: squeeze_copy.dim_out - kernels: - - arg_meta: null - kernel_name: torch::executor::squeeze_copy_dim_out - -- op: stack.out - kernels: - - arg_meta: null - kernel_name: torch::executor::stack_out - -- op: sub.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sub_out - -- op: sub.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::sub_scalar_out - -- op: sum.IntList_out - kernels: - - arg_meta: null - kernel_name: torch::executor::sum_dim_out - -- op: t_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::t_copy_out - -- op: tan.out - kernels: - - arg_meta: null - kernel_name: torch::executor::tan_out - -- op: tanh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::tanh_out - -- op: transpose_copy.int_out - kernels: - - arg_meta: null - kernel_name: torch::executor::transpose_copy_int_out - -- op: tril.out - kernels: - - arg_meta: null - kernel_name: torch::executor::tril_out - -- op: unbind_copy.int_out - kernels: - - arg_meta: null - kernel_name: torch::executor::unbind_copy_int_out - -- op: unsqueeze_copy.out - kernels: - - arg_meta: 
null
-      kernel_name: torch::executor::unsqueeze_copy_out
-
-- op: var.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::var_out
-
-- op: view_copy.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::view_copy_out
-
-- op: where.self_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::where_out
-
-- op: zeros.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::zeros_out
diff --git a/manual.h b/manual.h
new file mode 100644
index 00000000000..3719a142718
--- /dev/null
+++ b/manual.h
@@ -0,0 +1,6 @@
+namespace torch {
+namespace executor {
+  void manual_override();
+  void digant_add_out(torch::executor::KernelRuntimeContext & context, EValue** stack);
+  void arm_backend_register();
+}}
diff --git a/runtime/core/function_ref.h b/runtime/core/function_ref.h
index 92171134291..a07f6151f10 100644
--- a/runtime/core/function_ref.h
+++ b/runtime/core/function_ref.h
@@ -59,9 +59,7 @@ class FunctionRef;
 
 template <typename Ret, typename... Params>
 class FunctionRef<Ret(Params...)> {
-  Ret (*callback_)(const void* memory, Params... params) = nullptr;
   union Storage {
-    void* callable;
     Ret (*function)(Params...);
   } storage_;
 
@@ -70,57 +68,18 @@ class FunctionRef<Ret(Params...)> {
   explicit FunctionRef(std::nullptr_t) {}
 
   /**
-   * Case 1: A callable object passed by lvalue reference.
-   * Taking rvalue reference is error prone because the object will be always
-   * be destroyed immediately.
-   */
-  template <
-      typename Callable,
-      // This is not the copy-constructor.
-      typename std::enable_if<
-          !std::is_same<remove_cvref_t<Callable>, FunctionRef>::value,
-          int32_t>::type = 0,
-      // Avoid lvalue reference to non-capturing lambda.
-      typename std::enable_if<
-          !std::is_convertible<Callable, Ret (*)(Params...)>::value,
-          int32_t>::type = 0,
-      // Functor must be callable and return a suitable type.
-      // To make this container type safe, we need to ensure either:
-      // 1. The return type is void.
-      // 2. Or the resulting type from calling the callable is convertible to
-      //    the declared return type.
-      typename std::enable_if<
-          std::is_void<Ret>::value ||
-              std::is_convertible<
-                  decltype(std::declval<Callable>()(std::declval<Params>()...)),
-                  Ret>::value,
-          int32_t>::type = 0>
-  explicit FunctionRef(Callable& callable)
-      : callback_([](const void* memory, Params... params) {
-          auto& storage = *static_cast<const Storage*>(memory);
-          auto& callable = *static_cast<Callable*>(storage.callable);
-          return static_cast<Ret>(callable(std::forward<Params>(params)...));
-        }) {
-    storage_.callable = &callable;
-  }
-
-  /**
-   * Case 2: A plain function pointer.
+   * Case 1: A plain function pointer.
    * Instead of storing an opaque pointer to underlying callable object,
    * store a function pointer directly.
    * Note that in the future a variant which coerces compatible function
    * pointers could be implemented by erasing the storage type.
    */
-  /* implicit */ FunctionRef(Ret (*ptr)(Params...))
-      : callback_([](const void* memory, Params... params) {
-          auto& storage = *static_cast<const Storage*>(memory);
-          return storage.function(std::forward<Params>(params)...);
-        }) {
+  /* implicit */ FunctionRef(Ret (*ptr)(Params...)) {
     storage_.function = ptr;
   }
 
   /**
-   * Case 3: Implicit conversion from lambda to FunctionRef.
+   * Case 2: Implicit conversion from lambda to FunctionRef.
    * A common use pattern is like:
    *   void foo(FunctionRef<...>) {...}
    *   foo([](...){...})
@@ -144,11 +103,11 @@
       : FunctionRef(static_cast<Ret (*)(Params...)>(function)) {}
 
   Ret operator()(Params...
 params) const {
-    return callback_(&storage_, std::forward<Params>(params)...);
+    return storage_.function(std::forward<Params>(params)...);
   }
 
   explicit operator bool() const {
-    return callback_;
+    return storage_.function;
   }
 };
diff --git a/runtime/platform/target/Posix.cpp b/runtime/platform/target/Posix.cpp
index bc0f1d9f312..9f53964278a 100644
--- a/runtime/platform/target/Posix.cpp
+++ b/runtime/platform/target/Posix.cpp
@@ -52,11 +52,9 @@
 #define _ASSERT_PAL_INITIALIZED() \
   ({ \
     if (!initialized) { \
-      fprintf( \
-          ET_LOG_OUTPUT_FILE, \
+      printf( \
           "ExecuTorch PAL must be initialized before call to %s()", \
           __ET_FUNCTION); \
-      fflush(ET_LOG_OUTPUT_FILE); \
       et_pal_abort(); \
     } \
   })
@@ -144,8 +142,7 @@ void et_pal_emit_log_message(
   //
   // Clients who want to change the format or add other fields can override this
   // weak implementation of et_pal_emit_log_message.
-  fprintf(
-      ET_LOG_OUTPUT_FILE,
+  printf(
       "%c %02u:%02u:%02u.%06lu executorch:%s:%zu] %s\n",
       level,
       hour,
@@ -155,5 +152,5 @@ void et_pal_emit_log_message(
       filename,
       line,
       message);
-  fflush(ET_LOG_OUTPUT_FILE);
+  // fflush(ET_LOG_OUTPUT_FILE);
 }
diff --git a/schema/CMakeLists.txt b/schema/CMakeLists.txt
index 0c7dc2cbec4..55c07fd5f7b 100644
--- a/schema/CMakeLists.txt
+++ b/schema/CMakeLists.txt
@@ -41,7 +41,7 @@ add_custom_command(
     -o "${_program_schema__include_dir}/executorch/schema"
     ${_program_schema__srcs}
   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-  DEPENDS ${FLATC_EXECUTABLE} ${_program_schema__srcs}
+  DEPENDS ${_program_schema__srcs}
   COMMENT "Generating program_schema headers"
   VERBATIM)