diff --git a/.gitmodules b/.gitmodules index 4b783f4abc0..39909b9c84c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -37,3 +37,6 @@ [submodule "examples/third-party/llama"] path = examples/third-party/llama url = https://github.com/facebookresearch/llama.git +[submodule "backends/arm/third-party/ethos-u-core-driver"] + path = backends/arm/third-party/ethos-u-core-driver + url = https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 1da66101409..4f5decb194c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,10 +59,20 @@ endif() # directory, before and after this command is invoked - targets in # sub-directories added after this command is invoked if(CMAKE_BUILD_TYPE STREQUAL "Release") + # To enable logging in Release mode + option( + EXECUTORCH_ENABLE_LOGGING_RELEASE_MODE + "Enable logging in release mode" OFF) + + set(_ET_LOG_ENABLE 0) + if (${EXECUTORCH_ENABLE_LOGGING_RELEASE_MODE}) + set(_ET_LOG_ENABLE 1) + endif() + # Avoid pulling in the logging strings, which can be large. - add_definitions(-DET_LOG_ENABLED=0) - # Avoid pulling in the flatbuffer data verification logic, which can add about - # 20kB. + add_definitions(-DET_LOG_ENABLED=${_ET_LOG_ENABLE}) + # Avoid pulling in the flatbuffer data verification + # logic, which can add about 20kB. add_definitions(-DET_ENABLE_PROGRAM_VERIFICATION=0) endif() @@ -94,17 +104,21 @@ option(BUILD_SELECTIVE_BUILD_TEST option(EXECUTORCH_BUILD_SIZE_TEST "Whether to build size test" OFF) +# Option to register op list +option(SELECT_OPS_LIST "Register the following list of ops" OFF) + if(BUILD_SELECTIVE_BUILD_TEST) option(SELECT_ALL_OPS "Whether to register all ops defined in portable kernel library." OFF) - # Option to register op list - option(SELECT_OPS_LIST "Register the following list of ops" OFF) - # Option to register ops from yaml file option(SELECT_OPS_YAML "Register all the ops from a given yaml file" OFF) endif() +# Build Arm Baremetal backend +option(EXECUTORCH_BUILD_ARM_BAREMETAL + "Build the Arm Baremetal flow for Cortex-M and Ethos-U" OFF) + # Build xnn_executor_runner which depends on XNNPACK option(EXECUTORCH_BUILD_XNNPACK "Build xnn_executor_runner which depends on XNNPACK" OFF) @@ -303,6 +317,10 @@ if(EXECUTORCH_BUILD_XNNPACK) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack) endif() +if(EXECUTORCH_BUILD_ARM_BAREMETAL) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm) +endif() + # Add selective build subdirectory if(BUILD_SELECTIVE_BUILD_TEST) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/examples/selective_build) diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt new file mode 100644 index 00000000000..4dcf2ff0539 --- /dev/null +++ b/backends/arm/CMakeLists.txt @@ -0,0 +1,37 @@ +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +cmake_minimum_required(VERSION 3.19) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +include(${EXECUTORCH_ROOT}/build/Utils.cmake) + +set(_common_include_directories ${EXECUTORCH_ROOT}/..) 
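+# Note: the parent of EXECUTORCH_ROOT is exposed so that executorch/... style include paths resolve against the repository checkout.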
+set(_common_compile_options -Wno-deprecated-declarations) + +include(cmake/Dependencies.cmake) + +set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp) +list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/") + +add_library( + executorch_delegate_ethos_u + STATIC ${_arm_baremetal_sources} +) +target_include_directories( + executorch_delegate_ethos_u + PUBLIC + ${_common_include_directories} +) +target_include_directories( + executorch_delegate_ethos_u + PUBLIC + ${DRIVER_ETHOSU_INCLUDE_DIR} +) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 2a734c68ff7..1727b3fe28d 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -12,6 +12,8 @@ import logging import operator import os +import struct +import subprocess import tempfile from typing import final, List @@ -143,6 +145,82 @@ def dbg_tosa_dump(tosa_fb, path): f.close() +# Output to Vela with current file-based compilation +# WARNING: if this changes, the runtime reader also needs to change +def vela_compile(tosa_fb): + with tempfile.TemporaryDirectory() as tmpdir: + tosaname = "out.tosa" + flatbuffer = tosa_fb.serialize() + f = open(os.path.join(tmpdir, tosaname), "wb") + f.write(flatbuffer) + f.close() + + # invoke vela + # TODO target ethos-u55-128 + vela_command = ( + f"cd {tmpdir}; vela --accelerator-config ethos-u55-128 {tosaname}" + ) + subprocess.run([vela_command], shell=True, check=True) + + np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz") + blocks = b"" + with np.load(np_path, allow_pickle=False) as data: + # Emit the NPZ regions as: + # - 16 byte block name null terminated string (padded to 16 if name shorter) + # - 4 byes of int32 block length and 12 bytes of 0's + # - block data (padded to 16 byte alignment at end) + # Repeat for all blocks + for key in data.keys(): + block_name = bytes(key, "utf8")[:15] + block_name = block_name + b"\x00" * (16 - len(block_name)) + + block_data = b"" + if key in ("input_shape", "output_shape"): + inputs = data[key] + # Encode a struct of int len; and one or more int x,y,z,w shape; + input_struct = struct.pack(":-gdwarf-3>" + "$<$:-fno-unwind-tables;-fno-rtti;-fno-exceptions>" + -fdata-sections + -ffunction-sections) + +# Compile defines +add_compile_definitions( + "$<$>:NDEBUG>") + +# Link options +add_link_options( + -mcpu=${GCC_CPU} + -mthumb + --specs=nosys.specs) + +# Set floating point unit +if(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+fp") + set(FLOAT hard) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+nofp") + set(FLOAT soft) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m33(\\+|$)" OR + CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m55(\\+|$)" OR + CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m85(\\+|$)") + set(FLOAT hard) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m4(\\+|$)" OR + CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m7(\\+|$)") + set(FLOAT hard) + set(FPU_CONFIG "fpv4-sp-d16") + add_compile_options(-mfpu=${FPU_CONFIG}) + add_link_options(-mfpu=${FPU_CONFIG}) +else() + set(FLOAT soft) +endif() + +if(FLOAT) + add_compile_options(-mfloat-abi=${FLOAT}) + add_link_options(-mfloat-abi=${FLOAT}) +endif() + +add_link_options(LINKER:--nmagic,--gc-sections) + +# Compilation warnings +add_compile_options( +# -Wall +# -Wextra + +# -Wcast-align +# -Wdouble-promotion +# -Wformat +# -Wmissing-field-initializers +# -Wnull-dereference +# -Wredundant-decls +# -Wshadow +# -Wswitch +# -Wswitch-default +# -Wunused + -Wno-redundant-decls + -Wno-psabi +) diff --git a/backends/arm/cmake/build.sh b/backends/arm/cmake/build.sh new 
file mode 100755 index 00000000000..0dbb8cf2177 --- /dev/null +++ b/backends/arm/cmake/build.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -e + +# +# Setup toolchain +# +BASEDIR=`realpath $(dirname "$0")` +echo "building using build.sh in $BASEDIR" + +ARCH=$(uname -i) +GCCPATH=${BASEDIR}/arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi/bin/ + +echo $GCCPATH +if test -d "${GCCPATH}"; then + echo Using exising compiler ${GCCPATH} +else + pushd ${BASEDIR}/ + ./toolchain.sh + popd +fi +export PATH=${PATH}:${GCCPATH} + +echo building with `arm-none-eabi-gcc -v 2>&1 | grep "^gcc"` + + +# +# Prepare and run clean build +# +rm -rf buck-out/ build/lib/ cmake-out/ +rm -rf cmake-corstone +mkdir cmake-corstone +cd cmake-corstone + +#cmake -DBUCK2=buck2 .. + +#cmake --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake .. +cmake -DFLATC_EXECUTABLE=flatc \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ + -DEXECUTORCH_BUILD_HOST_TARGETS=OFF \ + -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ + -DCMAKE_SYSTEM_PROCESSOR=cortex-m55+nodsp+nofp \ + -DETHOSU_TARGET_NPU_CONFIG=ethos-u55-128 \ + --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_ENABLE_LOGGING_RELEASE_MODE=ON \ + .. + +cd .. +cmake --build cmake-corstone -j9 --target ethos_u ethosu_core_driver executorch portable_ops_lib portable_kernels diff --git a/backends/arm/cmake/toolchain.sh b/backends/arm/cmake/toolchain.sh new file mode 100755 index 00000000000..92188ee982d --- /dev/null +++ b/backends/arm/cmake/toolchain.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +set -e + +# Cross compiler for Arm baremetal (e.g. Corestone-300 FVP or silcon) +ARCH=$(uname -i) +curl -o gcc.tar.xz https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi.tar.xz +tar xf gcc.tar.xz +export PATH=${PATH}:`(cd arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi/bin/; pwd)` diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp new file mode 100644 index 00000000000..f1da72b6396 --- /dev/null +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -0,0 +1,313 @@ +/* + * Copyright 2023 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Arm backend for Ethos-U baremetal driver stack, this relies on the + * ethos-u-core-driver for hardware interaction. 
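+ * The delegate payload is the vela_bin_stream produced by vela_compile() in arm_backend.py: init() validates its header/footer and execute() parses it with vela_read() before invoking the NPU driver.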
+ */ + +#include +#include + +#include +#include +#include + +#include +#include + +using namespace std; + +namespace torch { +namespace executor { + +// TODO we should be in 0x31, not this lower 1MB sRAM +// SRAM (rwx) : ORIGIN = 0x31000000, LENGTH = 0x00200000 +#define CS300_SRAM_LOW ((void*)0x11000000) +#define CS300_SRAM_HIGH ((void*)0x110FFFFF) + +class ArmBackend final : public PyTorchBackendInterface { + public: + ArmBackend() {} + + ~ArmBackend() = default; + + virtual bool is_available() const override { + return 1; + } + + Result init( + BackendInitContext& context, + FreeableBuffer* processed, + ArrayRef compile_specs) const override { + ET_LOG(Info, "ArmBackend::init %p", processed->data()); + + char* data = (char*)processed->data(); + size_t size = processed->size(); + char* foot = data + size - 16; + + // Header and footer both 16 bit aligned suggest valid structure and we + // wont walk off the end of the chunks and segfault + if (!((int)data == next_mul_16((int)data))) { + ET_LOG(Error, "ArmBackend::init: Binary needs to be 16 byte unaligned"); + return Error::InvalidProgram; + } + if (!((int)foot == next_mul_16((int)foot))) { + ET_LOG(Error, "ArmBackend::init: Program unexpected size"); + return Error::InvalidProgram; + } + if (!(0 == strncmp(data, "vela_bin_stream", 15))) { + ET_LOG(Error, "ArmBackend::init: Binary passed not a vela_bin_stream"); + return Error::InvalidProgram; + } + if (!(0 == strncmp(foot, "vela_end_stream", 15))) { + ET_LOG(Error, "ArmBackend::init: Binary passed missing vela_end_stream"); + return Error::InvalidProgram; + } + // Verify address range is accessible current expectation is the program + // is wholly stored in SRAM + if (!(data > CS300_SRAM_LOW || foot < CS300_SRAM_HIGH)) { + ET_LOG(Error, "ArmBackend::init: Expected program binary to be in SRAM"); + return Error::InvalidProgram; + } + + // Return the same buffer we were passed - this data will be + // executed directly + return processed; + } + + Error execute( + BackendExecutionContext& context, + DelegateHandle* input_handle, + EValue** args) const override { + FreeableBuffer* processed = (FreeableBuffer*)input_handle; + + ET_LOG(Info, "ArmBackend::execute %p", processed->data()); + + vela_handles handles; + + // Command stream - we know at this point it's aligned + char* data = (char*)processed->data(); + + // Read key sections from the vela_bin_stream + if (!this->vela_read(data, &handles, processed->size())) { + ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout"); + return Error::InvalidProgram; + } + + ET_LOG( + Debug, + "ArmBackend::execute: Running program data:\n cmd %p %d\n weight %p %d\n scratch %p %d\n", + handles.cmd_data, + handles.cmd_data_size, + handles.weight_data, + handles.weight_data_size, + handles.scratch_data, + handles.scratch_data_size); + + // Write inputs into SRAM scratch area defined by Vela + for (int i = 0; i < handles.input_shape.size(); i++) { + const char* input_addr = handles.scratch_data + handles.input_offset[i]; + // Process input EValue into scratch + // TODO: optimise into direct write for compatible, contig layout + int* input_address = (int*)input_addr; + auto tensor_in = args[i]->toTensor(); + for (int j = 0; j < tensor_in.numel(); j++) { + // TODO: extend beyond 4 byte tensors + input_address[j] = tensor_in.mutable_data_ptr()[j]; + } + } + +#if 0 + // TMP emit scratch + printf("Scratch after setup:\n"); + for (int i = 0; i < handles.scratch_data_size; i++) { + printf("%02x ", ((char*)handles.scratch_data)[i]); + if (!((i + 1) % 
4)) + printf("\n"); + } + printf("\n"); + // END TMP emit scratch +#endif + + // Allocate driver handle and synchronously invoke driver + ethosu_driver* drv = ethosu_reserve_driver(); + + uint64_t bases[2] = { + (uint64_t)handles.weight_data, (uint64_t)handles.scratch_data}; + size_t bases_size[2] = { + handles.weight_data_size, handles.scratch_data_size}; + int result = ethosu_invoke_v3( + drv, + (void*)handles.cmd_data, + handles.cmd_data_size, + bases, + bases_size, + 2, + nullptr); + + if (result != 0) { + ET_LOG( + Error, + "ArmBackend::execute: Ethos-U invocation failed error (%d)", + result); + return Error::InvalidProgram; + } + +#if 0 + // TMP emit scratch + printf("Scratch after:\n"); + for (int i = 0; i < handles.scratch_data_size; i++) { + printf("%02x ", ((char*)handles.scratch_data)[i]); + if (!((i + 1) % 4)) + printf("\n"); + } + printf("\n"); +#endif + + // output data from Ethos U + // We only handle one output at the moment + const char* output_addr = handles.scratch_data + handles.output_offset[0]; + // Outputs are in the index immediately after inputs + int output_index = handles.input_shape.size(); + + // Process results into EValue storage + // TODO: optimise into direct write for compatible, contig layout + int* output_address = (int*)output_addr; + auto tensor_out = args[output_index]->toTensor(); + for (int j = 0; j < tensor_out.numel(); j++) { + // TODO: extend beyond 4 byte tensors + tensor_out.mutable_data_ptr()[j] = output_address[j]; + } + + return Error::Ok; + } + + void destroy(DelegateHandle* handle) const override { + return; + } + + private: + typedef struct { + const char* cmd_data; + size_t cmd_data_size; + const char* weight_data; + size_t weight_data_size; + const char* scratch_data; + size_t scratch_data_size; + vector input_offset; + vector> input_shape; + vector output_offset; + vector> output_shape; + } vela_handles; + + typedef struct { + char name[16]; + int size; + char _pad[12]; + char data[]; + } vela_bin_block; + + typedef struct { + int count; + int shape[][4]; + } vela_shapes; + + typedef struct { + int count; + int offsets[]; + } vela_offsets; + + static int next_mul_16(int n) { + return ((n - 1) | 15) + 1; + } + + int vela_read(char* data, vela_handles* h, int size) const { + // Read header string + if (strncmp(data, "vela_bin_stream", 15)) { + return 0; + } + data += 16; + + // Expect one or more 'vela_bin_block's + while (1) { + vela_bin_block* b = (vela_bin_block*)data; + data += 16 + 16 + next_mul_16(b->size); + + // Exit with success on finding end of stream + if (!strncmp(b->name, "vela_end_stream", 15)) + return 1; + + if (!strncmp(b->name, "cmd_data", strlen("cmd_data"))) { + // This magic header confirms a valid command stream in binary + if (strncmp(b->data, "COP1", 4)) + return 0; + h->cmd_data = b->data; + h->cmd_data_size = b->size; + } + if (!strncmp(b->name, "weight_data", strlen("weight_data"))) { + h->weight_data = b->data; + h->weight_data_size = b->size; + } + if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) { + h->scratch_data = b->data; + h->scratch_data_size = b->size; + } + + // capture inputs and outputs + if (!strncmp(b->name, "scratch_data", strlen("scratch_data"))) { + h->scratch_data = b->data; + h->scratch_data_size = b->size; + } + + if (!strncmp(b->name, "input_offset", strlen("input_offset"))) { + vela_offsets* offsets = (vela_offsets*)b->data; + for (int i = 0; i < offsets->count; i++) { + h->input_offset.push_back(offsets->offsets[i]); + } + } + if (!strncmp(b->name, "output_offset", 
strlen("output_offset"))) { + vela_offsets* offsets = (vela_offsets*)b->data; + for (int i = 0; i < offsets->count; i++) { + h->output_offset.push_back(offsets->offsets[i]); + } + } + + if (!strncmp(b->name, "input_shape", strlen("input_shape"))) { + vela_shapes* shapes = (vela_shapes*)b->data; + for (int i = 0; i < shapes->count; i++) { + vector s = { + shapes->shape[i][0], + shapes->shape[i][1], + shapes->shape[i][2], + shapes->shape[i][3]}; + h->input_shape.push_back(s); + } + } + if (!strncmp(b->name, "output_shape", strlen("output_shape"))) { + vela_shapes* shapes = (vela_shapes*)b->data; + for (int i = 0; i < shapes->count; i++) { + vector s = { + shapes->shape[i][0], + shapes->shape[i][1], + shapes->shape[i][2], + shapes->shape[i][3]}; + h->output_shape.push_back(s); + } + } + } + } +}; + +namespace { +auto backend = ArmBackend(); +Backend backend_id{"ArmBackend", &backend}; +static auto registered = register_backend(backend_id); +} // namespace + +} // namespace executor +} // namespace torch diff --git a/backends/arm/test/test_models.py b/backends/arm/test/test_models.py index 405d76ac50c..0b793721945 100644 --- a/backends/arm/test/test_models.py +++ b/backends/arm/test/test_models.py @@ -25,6 +25,7 @@ class TosaProfile(Enum): BI = 0 # Base Inference MI = 1 # Main Inference MT = 2 # Main Training + BI_INT = 3 # integer only BI subset tests (for test graphs) class TorchBuilder: @@ -39,6 +40,7 @@ class simple_add(torch.nn.Module): inputs = { TosaProfile.BI: (torch.ones(5),), TosaProfile.MI: (torch.ones(5),), + TosaProfile.BI_INT: (torch.ones(5, dtype=torch.int32),), } def __init__(self): @@ -47,6 +49,21 @@ def __init__(self): def forward(self, x): return x + x + @register_test + class simple_add_2(torch.nn.Module): + inputs = { + TosaProfile.BI_INT: ( + torch.ones(5, dtype=torch.int32), + torch.ones(5, dtype=torch.int32), + ), + } + + def __init__(self): + super().__init__() + + def forward(self, x, y): + return x + y + @register_test class simple_add_broadcast(torch.nn.Module): inputs = { @@ -82,7 +99,7 @@ def forward(self, x): x = self.fc(x) return x - @register_test + # @register_test class simple_conv2d(torch.nn.Module): inputs = { TosaProfile.BI: ( @@ -106,7 +123,7 @@ def forward(self, x): x = self.conv2d(x) return x - @register_test + # @register_test class block_two_conv2d(torch.nn.Module): inputs = { TosaProfile.BI: (torch.ones(1, 3, 256, 256),), @@ -127,7 +144,7 @@ def forward(self, x): x = self.conv2d_2(x) return x - @register_test + # @register_test class simple_depthwise_conv2d(torch.nn.Module): inputs = { TosaProfile.BI: ( @@ -231,7 +248,7 @@ def __init__(self): def forward(self, x): return self.softmax(x) - @register_test + # @register_test class block_conv_norm_activation(torch.nn.Module): inputs = { TosaProfile.BI: (torch.ones(1, 3, 256, 256),), @@ -253,7 +270,7 @@ def forward(self, x): x = self.relu6(x) return x - @register_test + # @register_test class block_bottleneck_residual(torch.nn.Module): # This is the essence of MobileNetV2 # Ref: https://arxiv.org/abs/1801.04381 diff --git a/backends/arm/test/test_tosa.py b/backends/arm/test/test_tosa.py index b3e59658641..089092275c9 100644 --- a/backends/arm/test/test_tosa.py +++ b/backends/arm/test/test_tosa.py @@ -34,22 +34,22 @@ class TestBasicNN(unittest.TestCase): def test_minimal_MI(self): - for test_model in TestList: + for test_model in ("simple_add",): print(f"Running test {test_model}") model, inputs, outputs = prepare_model_and_ref(test_model, TosaProfile.MI) model_edge, exec_prog = export_model(model, inputs, 
[]) # TODO: check there is a tosa delegate blob in the output - def test_minimal_BI(self): - for test_model in TestList: - print(f"Running test {test_model}") - model, inputs, outputs = prepare_model_and_ref(test_model, TosaProfile.BI) - if inputs is None: - print(" Skipping, no inputs for this profile") - continue - model_edge, exec_prog = export_model(model, inputs, []) - # TODO: check there is a tosa delegate blob in the output + # def test_minimal_BI(self): + # for test_model in TestList: + # print(f"Running test {test_model}") + # model, inputs, outputs = prepare_model_and_ref(test_model, TosaProfile.BI) + # if inputs is None: + # print(" Skipping, no inputs for this profile") + # continue + # model_edge, exec_prog = export_model(model, inputs, []) + # # TODO: check there is a tosa delegate blob in the output def prepare_model_and_ref(test_model, profile=TosaProfile.MI): diff --git a/backends/arm/third-party/ethos-u-core-driver b/backends/arm/third-party/ethos-u-core-driver new file mode 160000 index 00000000000..90f9df900ac --- /dev/null +++ b/backends/arm/third-party/ethos-u-core-driver @@ -0,0 +1 @@ +Subproject commit 90f9df900acdc0718ecd2dfdc53780664758dec5 diff --git a/examples/backend/arm/arm_ethosu_minimal.py b/examples/backend/arm/arm_ethosu_minimal.py new file mode 100644 index 00000000000..93b73909251 --- /dev/null +++ b/examples/backend/arm/arm_ethosu_minimal.py @@ -0,0 +1,212 @@ +# Copyright 2023 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import json +import os +import subprocess + +import executorch.exir as exir + +import numpy as np +from executorch.backends.arm.arm_backend import ArmPartitioner +from executorch.backends.arm.test.test_models import TosaProfile +from executorch.backends.arm.test.test_tosa import prepare_model_and_ref + +from executorch.exir.backend.backend_api import to_backend +from executorch.exir.backend.canonical_partitioners.duplicate_dequant_node_pass import ( + DuplicateDequantNodePass, +) +from executorch.exir.backend.compile_spec_schema import CompileSpec + +# Assumes you have these two tools on your path +TOSA_REF_MODEL_PATH = "tosa_reference_model" +VELA_COMPILER_PATH = "vela" + +# Basic config for graph capture +_CAPTURE_CONFIG = exir.CaptureConfig(enable_aot=True) +_EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig( + _check_ir_validity=False, +) + +EXAMPLE_TEST_LIST = ["simple_add", "simple_add_2"] + +# +# +# +# +def tosa_ref_capture_inputs( + model_edge, + inputs, + path, + input_quantization_scales, + input_quantization_zps, + profile=TosaProfile.MI, +): + # Emit TOSA test data from the model inputs - assumes whole graph lowered so we just have + # placeholders for the TOSA delegate. Emits data in tosa_ref_model expected layout. + # - Skips placeholders which are encoded as constants (i.e. 
are already captured weights) + # - Assumes argument order is fixed + argument_names = [] + for node in model_edge.exported_program.graph.nodes: + gs = model_edge.exported_program.graph_signature + if node.op == "placeholder": + if node.name in gs.inputs_to_parameters: + pass + elif node.name in gs.inputs_to_buffers: + pass + else: + argument_names.append(node.name) + else: + break + + for arg in zip(argument_names, inputs): + name = arg[0] + data = arg[1].detach().numpy() + file_path = path + "/" + name + ".npy" + + # Torch is doing Input[FP32]->Q[INT8]->DQ[FP32]->Operator[FP32]->Q[INT]->DQ[FP32]->[Output]FP32 + # Need to quantize the input to INT8 for TOSA comsumption + if profile is TosaProfile.BI: + data_quantized = ( + (data / input_quantization_scales[name]) - input_quantization_zps[name] + ).astype(np.int8) + np.save(file_path, data_quantized, allow_pickle=False) + else: + np.save(file_path, data, allow_pickle=False) + + +# +# Minimal sequence to take a model through the ArmPartitioner and produce +# both TOSA intermediate output, and an Ethos-U55 command stream within +# the ExecuTorch .pte binary +# +def run_test(op, profile=TosaProfile.MI, output_path="./ethosout/"): + # + # Minimal sequence to take model through TosaPartitioner and emit + # tosaout/ debug directory containing the flatbuffer - assumes one and will only save last output + # tosaout is generated even for partial/broken subgraph capture to aid in debg + # delegated.pte containing the flatbuffer within the executorch flatbuffer binary + # + print(f"\n\033[96mProcessing:::{op}\033[0m") + print(f"\033[96mDebug output path for intermediates: {output_path}\033[0m") + + os.makedirs(output_path, exist_ok=True) + + # Debug output for TORCH + TORCH_OUT_PATH = os.path.join(output_path, op, "torch", "") + os.makedirs(TORCH_OUT_PATH, exist_ok=True) + + # Debug output for TOSA + TOSA_OUT_PATH = os.path.join(output_path, op, "tosa", "") + os.makedirs(TOSA_OUT_PATH, exist_ok=True) + + model, inputs, torch_output = prepare_model_and_ref(op, profile) + + if inputs is None: + print("\033[96m Skipping, model has no inputs for TOSA profile \033[0m") + return + + print(f" Model: {op}\n Inputs: {inputs}\n Outputs: {torch_output}") + + # Export model + model_capture = exir.capture(model, inputs, _CAPTURE_CONFIG) + model_edge = model_capture.to_edge(_EDGE_COMPILE_CONFIG) + + # Partition with ArmBackend + ArmPartitioner.compile_spec = [ + CompileSpec("debug_tosa_path", bytes(TOSA_OUT_PATH, "utf8")) + ] + model_edge.exported_program = to_backend( + model_edge.transform(DuplicateDequantNodePass()).exported_program, + ArmPartitioner, + ) + exec_prog = model_edge.to_executorch() + + # Save .pte including delegated Vela section + with open(TORCH_OUT_PATH + "/delegated.pte", "wb") as fh: + fh.write(exec_prog.buffer) + + # NOTE: + # Additional steps from here are optional but can be helpful with + # debug as they will capture the inputs and outputs as well as running + # the intermediate output on the tosa_reference_model. + # This can ensure the compilation flow is working correctly as part of + # a development loop, ahead of running the example on hardware. 
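+    # These debug steps assume flatc, tosa_reference_model and vela are on PATH (see TOSA_REF_MODEL_PATH / VELA_COMPILER_PATH above) and that the script runs from the ExecuTorch repo root, since the tosa.fbs schema path below is relative.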
+ + # Save inputs for TOSA reference run + tosa_ref_capture_inputs(model_edge, inputs, TOSA_OUT_PATH, {}, {}, profile) + + # Save ground truth results to file + with open(TORCH_OUT_PATH + "/torch_output.npy", "wb") as f: + np.save(f, torch_output.detach().numpy()) + + # Convert TOSA Flatbuffer into JSON format for human debugging + cmd_flatc = ( + "flatc" + + " -o " + + TOSA_OUT_PATH + + " --raw-binary -t ./backends/arm/third-party/serialization_lib/schema/tosa.fbs -- " + + TOSA_OUT_PATH + + "/output.tosa" + ) + subprocess.run([cmd_flatc], shell=True, check=True) + + ### Run the TOSA flatbuffer through TOSA Ref_Model and print the results + DESC_FILE_NAME = "/desc.json" + DESC_FILE_PATH = TOSA_OUT_PATH + DESC_FILE_NAME + cmd_ref_model = TOSA_REF_MODEL_PATH + " --test_desc " + DESC_FILE_PATH + subprocess.run([cmd_ref_model], shell=True, check=True) + + ## Load in the JSON File, Read the tosa output + desc_file = open(DESC_FILE_PATH) + desc_json = json.load(desc_file) + tosa_out_filenames = desc_json["ofm_file"] + for tosa_out_fm_file_name in tosa_out_filenames: + f = open(TOSA_OUT_PATH + "/" + tosa_out_fm_file_name, "rb") + tosa_output = np.load(f) + + ## Read the Torch Output + torch_file = open(TORCH_OUT_PATH + "/torch_output.npy", "rb") + torch_output = np.load(torch_file) + + ## Compare Tosa and Torch Results + if np.allclose(tosa_output, torch_output, rtol=1e-1, atol=1e-1, equal_nan=True): + print( + "\033[92m" + + "Torch and Tosa Reference results are matching for operator: " + + op + + " from " + + str(str(profile)) + + "\033[0m" + ) + + else: + print("\033[91m" + "Sorry, Torch and Tosa Reference Results Do not Match!") + print("============================") + print("TOSA Output Shape is: " + str(tosa_output.shape)) + print("TOSA Output is: ") + print(tosa_output) + print("\033[93m") + print("============================") + print("Torch Output Shape is: " + str(torch_output.shape)) + print("Torch Output is: ") + print(torch_output) + print("\033[0m") + + if profile in (TosaProfile.BI, TosaProfile.BI_INT): + cmd_vela = "cd " + TOSA_OUT_PATH + "; " + VELA_COMPILER_PATH + " ./output.tosa" + try: + subprocess.run([cmd_vela], shell=True, check=True) + print("\033[92m" + "Vela compile worked for: " + op + "\033[0m") + except: + print("\033[91m" + "Vela compile failed for: " + op + "\033[0m") + else: + print("\033[96m" + "Skipping Vela test on non-BI profile." + "\033[0m") + + +# systest mode for running all models against both inference profiles +if __name__ == "__main__": + for op in EXAMPLE_TEST_LIST: + run_test(op, profile=TosaProfile.BI_INT) diff --git a/examples/arm/arm_tosa_e2e.py b/examples/backend/arm/arm_tosa_e2e.py similarity index 94% rename from examples/arm/arm_tosa_e2e.py rename to examples/backend/arm/arm_tosa_e2e.py index 0dba4fa9866..8522b73bc3c 100644 --- a/examples/arm/arm_tosa_e2e.py +++ b/examples/backend/arm/arm_tosa_e2e.py @@ -260,21 +260,21 @@ def tosa_run_test(op, profile=TosaProfile.MI): # noqa: C901 print(torch_output) print("\033[0m") - # if profile == TosaProfile.BI: - # cmd_vela = "cd " + TOSA_OUT_PATH + "; " + VELA_COMPILER_PATH + " ./output.tosa" - # try: - # subprocess.run([cmd_vela], shell=True, check=True) - # print("\033[92m" + "Vela compile worked for: " + op + "\033[0m") - # except: - # print("\033[91m" + "Vela compile failed for: " + op + "\033[0m") - # else: - # print("\033[96m" + "Skipping Vela test on non-BI profile." 
+ "\033[0m") + if profile == TosaProfile.BI: + cmd_vela = "cd " + TOSA_OUT_PATH + "; " + VELA_COMPILER_PATH + " ./output.tosa" + try: + subprocess.run([cmd_vela], shell=True, check=True) + print("\033[92m" + "Vela compile worked for: " + op + "\033[0m") + except: + print("\033[91m" + "Vela compile failed for: " + op + "\033[0m") + else: + print("\033[96m" + "Skipping Vela test on non-BI profile." + "\033[0m") # Temp systest mode for running all models against both inference profiles if __name__ == "__main__": for op in TestList: - tosa_run_test(op, profile=TosaProfile.MI) + tosa_run_test(op, profile=TosaProfile.BI) for op in TestList: tosa_run_test(op, profile=TosaProfile.BI) diff --git a/examples/backend/arm/ethos-u-setup/core_platform/patches/0001-Executorch-Add-README.patch b/examples/backend/arm/ethos-u-setup/core_platform/patches/0001-Executorch-Add-README.patch new file mode 100644 index 00000000000..b00141486dd --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/core_platform/patches/0001-Executorch-Add-README.patch @@ -0,0 +1,35 @@ +From 48a99f4b00e504c13cd74ca44a5ce7128f719cba Mon Sep 17 00:00:00 2001 +From: Digant Desai +Date: Tue, 3 Oct 2023 21:20:21 -0700 +Subject: [PATCH 1/6] [Executorch] Add README + +--- + applications/executorch_tests/README.md | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + create mode 100644 applications/executorch_tests/README.md + +diff --git a/applications/executorch_tests/README.md b/applications/executorch_tests/README.md +new file mode 100644 +index 0000000..f2dfb05 +--- /dev/null ++++ b/applications/executorch_tests/README.md +@@ -0,0 +1,16 @@ ++## ExecuTorch ++A unified ML software stack within the PyTorch platform for edge devices. It ++defines new compiler entry points as well as a state-of-art runtime. ++ ++Home: https://github.com/pytorch/executorch/ ++ ++### executor_runner ++ ++This test is a simple wrapper around ExecuTorch runtime, capable of running ++`.pte` model files compatible with ExecuTorch. ++ ++If configured correctly with `ET_*` CMake variables pointing to the ExecuTorch ++project build, then this test bin executes `model.pte.h` file converted from ++`model.pte` using `pte_to_header.py`, from the ExecuTorch project root dir, ++containing an ExecuTorch compatible PyTorch model on the Costrone 300 FVP using ++ExecuTorch runtime. 
+-- +2.39.3 + diff --git a/examples/backend/arm/ethos-u-setup/core_platform/patches/0002-Executorch-local-patch-regress-cmake-version-from-3..patch b/examples/backend/arm/ethos-u-setup/core_platform/patches/0002-Executorch-local-patch-regress-cmake-version-from-3..patch new file mode 100644 index 00000000000..7e96c139720 --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/core_platform/patches/0002-Executorch-local-patch-regress-cmake-version-from-3..patch @@ -0,0 +1,26 @@ +From 3359f94fc57ac76b3f5995d4453975251b56ae71 Mon Sep 17 00:00:00 2001 +From: Digant Desai +Date: Thu, 28 Sep 2023 18:05:03 -0700 +Subject: [PATCH 2/6] [Executorch][local-patch] regress cmake version from 3.21 + --> 3.20 + +--- + targets/corstone-300/CMakeLists.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/targets/corstone-300/CMakeLists.txt b/targets/corstone-300/CMakeLists.txt +index 62205bb..7dda8a1 100644 +--- a/targets/corstone-300/CMakeLists.txt ++++ b/targets/corstone-300/CMakeLists.txt +@@ -42,7 +42,7 @@ set(MEMORY_ARENA "dram" CACHE STRING "Memory config for arena") + # Project + ############################################################################# + +-cmake_minimum_required(VERSION 3.21) ++cmake_minimum_required(VERSION 3.20) + + project(ethos-u-corstone-300 VERSION 0.0.1) + +-- +2.39.3 + diff --git a/examples/backend/arm/ethos-u-setup/core_platform/patches/0003-Executorch-local-patch-Disable-warnings-to-reduce-ve.patch b/examples/backend/arm/ethos-u-setup/core_platform/patches/0003-Executorch-local-patch-Disable-warnings-to-reduce-ve.patch new file mode 100644 index 00000000000..af3325acda8 --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/core_platform/patches/0003-Executorch-local-patch-Disable-warnings-to-reduce-ve.patch @@ -0,0 +1,53 @@ +From b13de10ad4920da069d44efb99eceb86f6169a32 Mon Sep 17 00:00:00 2001 +From: Digant Desai +Date: Thu, 28 Sep 2023 18:05:30 -0700 +Subject: [PATCH 3/6] [Executorch][local-patch] Disable warnings to reduce + verbosity + +--- + cmake/toolchain/arm-none-eabi-gcc.cmake | 28 ++++++++++++------------- + 1 file changed, 14 insertions(+), 14 deletions(-) + +diff --git a/cmake/toolchain/arm-none-eabi-gcc.cmake b/cmake/toolchain/arm-none-eabi-gcc.cmake +index 093005e..0e6a2ed 100644 +--- a/cmake/toolchain/arm-none-eabi-gcc.cmake ++++ b/cmake/toolchain/arm-none-eabi-gcc.cmake +@@ -85,21 +85,21 @@ add_link_options(LINKER:--nmagic,--gc-sections) + + # Compilation warnings + add_compile_options( +- -Wall +- -Wextra ++ # -Wall ++ # -Wextra + +- -Wcast-align +- -Wdouble-promotion +- -Wformat +- -Wmissing-field-initializers +- -Wnull-dereference +- -Wredundant-decls +- -Wshadow +- -Wswitch +- -Wswitch-default +- -Wunused ++ # -Wcast-align ++ # -Wdouble-promotion ++ # -Wformat ++ # -Wmissing-field-initializers ++ # -Wnull-dereference ++ # -Wredundant-decls ++ # -Wshadow ++ # -Wswitch ++ # -Wswitch-default ++ # -Wunused + +- -Wno-redundant-decls ++ # -Wno-redundant-decls + +- -Wno-psabi ++ # -Wno-psabi + ) +-- +2.39.3 + diff --git a/examples/backend/arm/ethos-u-setup/core_platform/patches/0004-Executorch-local-patch-New-phdr-for-.data-section.patch b/examples/backend/arm/ethos-u-setup/core_platform/patches/0004-Executorch-local-patch-New-phdr-for-.data-section.patch new file mode 100644 index 00000000000..9981e302fef --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/core_platform/patches/0004-Executorch-local-patch-New-phdr-for-.data-section.patch @@ -0,0 +1,33 @@ +From 5423ef7ec31e4260ec79f2f6e60deddc1640f3e4 Mon Sep 17 00:00:00 2001 +From: 
Digant Desai +Date: Mon, 2 Oct 2023 20:39:39 -0700 +Subject: [PATCH 4/6] [Executorch][local-patch] New phdr for .data section + +--- + targets/corstone-300/platform.ld | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/targets/corstone-300/platform.ld b/targets/corstone-300/platform.ld +index 8d77329..8de77c4 100644 +--- a/targets/corstone-300/platform.ld ++++ b/targets/corstone-300/platform.ld +@@ -94,6 +94,7 @@ PHDRS + { + rom_exec PT_LOAD; + rom_dram PT_LOAD; ++ data PT_LOAD; /* HACK: New prog header for .data (and friends) going in DTCM */ + null PT_NULL; + } + +@@ -247,7 +248,7 @@ SECTIONS + /* All data end */ + __data_end__ = .; + +- } > DTCM :rom_exec ++ } > DTCM :data + + .sram.bss : + { +-- +2.39.3 + diff --git a/examples/backend/arm/ethos-u-setup/core_platform/patches/0005-Executorch-Add-pte-to-header-script.patch b/examples/backend/arm/ethos-u-setup/core_platform/patches/0005-Executorch-Add-pte-to-header-script.patch new file mode 100644 index 00000000000..47ed2c7e8be --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/core_platform/patches/0005-Executorch-Add-pte-to-header-script.patch @@ -0,0 +1,84 @@ +From dcf2e249d7f96f521e19c556d7529757aa94a0f5 Mon Sep 17 00:00:00 2001 +From: Digant Desai +Date: Tue, 3 Oct 2023 21:20:07 -0700 +Subject: [PATCH 5/6] [Executorch] Add pte to header script + +--- + .../executorch_tests/pte_to_header.py | 65 +++++++++++++++++++ + 1 file changed, 65 insertions(+) + create mode 100644 applications/executorch_tests/pte_to_header.py + +diff --git a/applications/executorch_tests/pte_to_header.py b/applications/executorch_tests/pte_to_header.py +new file mode 100644 +index 0000000..37d88aa +--- /dev/null ++++ b/applications/executorch_tests/pte_to_header.py +@@ -0,0 +1,65 @@ ++# Copyright (c) Meta Platforms, Inc. and affiliates. ++# All rights reserved. ++# ++# This source code is licensed under the BSD-style license found in the ++# LICENSE file in the root directory of this source tree. ++ ++import binascii ++import os ++from argparse import ArgumentParser, ArgumentTypeError ++ ++# Also see: https://git.mlplatform.org/ml/ethos-u/ml-embedded-evaluation-kit.git/tree/scripts/py/gen_model_cpp.py ++ ++bytes_per_line = 32 ++hex_digits_per_line = bytes_per_line * 2 ++ ++ ++def input_file_path(path): ++ if os.path.exists(path): ++ return path ++ else: ++ raise ArgumentTypeError(f"input filepath:{path} does not exist") ++ ++ ++parser = ArgumentParser() ++parser.add_argument( ++ "--pte", ++ help="ExecuTorch .pte model file", ++ type=input_file_path, ++ required=True, ++) ++parser.add_argument( ++ "--outdir", ++ help="Output dir for model_pte.h", ++ type=str, ++ required=False, ++ default=".", ++) ++parser.add_argument( ++ "--section", ++ help="Section attribute for the data array", ++ type=str, ++ required=False, ++ default=".sram.data", ++) ++args = parser.parse_args() ++outfile = os.path.join(args.outdir, "model_pte.h") ++attr = f'__attribute__((section("{args.section}"), aligned(16))) char ' ++ ++with open(args.pte, "rb") as fr, open( ++ outfile, "w" ++) as fw: ++ data = fr.read() ++ hexstream = binascii.hexlify(data).decode("utf-8") ++ hexstring = attr + "model_pte[] = {" ++ ++ for i in range(0, len(hexstream), 2): ++ if 0 == (i % hex_digits_per_line): ++ hexstring += "\n" ++ hexstring += "0x" + hexstream[i : i + 2] + ", " ++ ++ hexstring += "};\n" ++ fw.write(hexstring) ++ print( ++ f"Input: {args.pte} with {len(data)} bytes. Output: {outfile} with {len(hexstring)} bytes. Section: {args.section}." 
++ ) +-- +2.39.3 + diff --git a/examples/backend/arm/ethos-u-setup/core_platform/patches/0006-Executorch-Add-executorch_runner-test.patch b/examples/backend/arm/ethos-u-setup/core_platform/patches/0006-Executorch-Add-executorch_runner-test.patch new file mode 100644 index 00000000000..b87058071fe --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/core_platform/patches/0006-Executorch-Add-executorch_runner-test.patch @@ -0,0 +1,283 @@ +From 0c91f25a52d32d7f4b6ec787a40633a92af7f885 Mon Sep 17 00:00:00 2001 +From: Digant Desai +Date: Thu, 28 Sep 2023 19:07:51 -0700 +Subject: [PATCH 6/6] [Executorch] Add executorch_runner test + +--- + applications/CMakeLists.txt | 2 + + applications/executorch_tests/CMakeLists.txt | 76 +++++++++++ + applications/executorch_tests/runner.cpp | 133 +++++++++++++++++++ + cmake/helpers.cmake | 13 +- + 4 files changed, 222 insertions(+), 2 deletions(-) + create mode 100644 applications/executorch_tests/CMakeLists.txt + create mode 100644 applications/executorch_tests/runner.cpp + +diff --git a/applications/CMakeLists.txt b/applications/CMakeLists.txt +index 1fa2b2e..68e5427 100644 +--- a/applications/CMakeLists.txt ++++ b/applications/CMakeLists.txt +@@ -28,6 +28,8 @@ add_subdirectory(threadx_demo) + + add_subdirectory(message_handler_openamp) + ++add_subdirectory(executorch_tests) ++ + if (CMAKE_CXX_COMPILER_ID STREQUAL "ARMClang") + # Only armclang supported for now + add_subdirectory(trustzone_inference) +diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt +new file mode 100644 +index 0000000..c95d53e +--- /dev/null ++++ b/applications/executorch_tests/CMakeLists.txt +@@ -0,0 +1,76 @@ ++# ++# Copyright (c) 2021 Arm Limited. All rights reserved. ++# ++# SPDX-License-Identifier: Apache-2.0 ++# ++# Licensed under the Apache License, Version 2.0 (the License); you may ++# not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an AS IS BASIS, WITHOUT ++# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++# ++ ++if (NOT TARGET ethosu_core_driver) ++ return() ++endif() ++ ++#### ++#### ExecuTorch demo app/test ++#### ++ ++set(ET_DIR_PATH "<..>/executorch" CACHE PATH "Path to ExecuTorch dir") ++set(ET_BUILD_DIR_PATH "${ET_DIR_PATH}/cmake-out" CACHE PATH "Path to ExecuTorch build dir") ++set(ET_INCLUDE_PATH "${ET_DIR_PATH}/.." 
CACHE PATH "Path to ExecuTorch headers") ++set(ET_PTE_FILE_PATH "${ET_PTE_FILE_PATH}" CACHE PATH "Path to ExecuTorch model pte") ++ ++get_filename_component(ET_BUILD_DIR_PATH ${ET_BUILD_DIR_PATH} REALPATH) ++get_filename_component(ET_DIR_PATH ${ET_DIR_PATH} REALPATH) ++get_filename_component(ET_INCLUDE_PATH ${ET_INCLUDE_PATH} REALPATH) ++get_filename_component(ET_PTE_FILE_PATH ${ET_PTE_FILE_PATH} REALPATH) ++ ++message("**********************") ++message("ExecuTorch dir (ET_DIR_PATH) : ${ET_DIR_PATH}") ++message("ExecuTorch build dir(ET_BUILD_DIR_PATH) : ${ET_BUILD_DIR_PATH}") ++message("ExecuTorch headers (ET_INCUDE_PATH) : ${ET_INCLUDE_PATH}") ++message("ExecuTorch pte file (ET_PTE_FILE_PATH) : ${ET_PTE_FILE_PATH}") ++message("**********************") ++ ++set(LIB_ET_RUNTIME "${ET_BUILD_DIR_PATH}/libexecutorch.a") ++set(LIB_ET_OP_REGISTRATION "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_ops_lib.a") ++set(LIB_ET_OP_KERNELS "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_kernels.a") ++ ++add_custom_target( ++ gen_model_header ALL ++ DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/fake_dep ++) ++ ++add_custom_command( ++ OUTPUT ++ ${CMAKE_CURRENT_BINARY_DIR}/fake_dep ++ ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h ++ COMMAND ${PYTHON_EXECUTABLE} ./pte_to_header.py --pte ${ET_PTE_FILE_PATH} ++ --out ${CMAKE_CURRENT_BINARY_DIR} ++ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} ++ ) ++ ++ethosu_add_executable_test(executor_runner PRIVATE ++ WHOLE_ARCHIVE TRUE ++ SOURCES runner.cpp ++ LIBRARIES ++ ${LIB_ET_RUNTIME} ++ ${LIB_ET_OP_REGISTRATION} ++ ${LIB_ET_OP_KERNELS}) ++ ++add_dependencies(executor_runner gen_model_header) ++ ++target_include_directories(executor_runner PRIVATE ++${ET_INCLUDE_PATH} ++${CMAKE_CURRENT_BINARY_DIR}) ++ ++# TODO Memory setup +diff --git a/applications/executorch_tests/runner.cpp b/applications/executorch_tests/runner.cpp +new file mode 100644 +index 0000000..7ef920d +--- /dev/null ++++ b/applications/executorch_tests/runner.cpp +@@ -0,0 +1,133 @@ ++/* Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under the BSD-style license found in the ++ * LICENSE file in the root directory of this source tree. ++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++// Model file - TODO make this configurable through CMake ++#include "model_pte.h" ++ ++using namespace std; ++using torch::executor::Result; ++using torch::executor::Error; ++ ++__attribute__((section(".sram.data"), aligned(16))) uint8_t method_allocator_pool[4 * 1024U]; ++ ++void et_pal_init(void) {} ++ ++__ET_NORETURN void et_pal_abort(void) { ++ __builtin_trap(); ++} ++ ++et_timestamp_t et_pal_current_ticks(void) { ++ // libc.a - warning: _gettimeofday is not implemented and will always fail ++ return 11223344; ++} ++ ++/** ++ * Emit a log message via platform output (serial port, console, etc). ++ */ ++void et_pal_emit_log_message( ++ __ET_UNUSED et_timestamp_t timestamp, ++ et_pal_log_level_t level, ++ const char* filename, ++ __ET_UNUSED const char* function, ++ size_t line, ++ const char* message, ++ __ET_UNUSED size_t length) { ++ fprintf( ++ stderr, ++ "%c executorch:%s:%zu] %s\n", ++ level, ++ filename, ++ line, ++ message); ++} ++ ++int main() { ++ torch::executor::runtime_init(); ++ ++ auto loader = torch::executor::util::BufferDataLoader(model_pte, sizeof(model_pte)); ++ ET_LOG(Info, "Model PTE file loaded. 
Size: %lu bytes.", sizeof(model_pte)); ++ Result program = torch::executor::Program::load(&loader); ++ if(!program.ok()) { ++ ET_LOG(Info,"Program loading failed @ 0x%p: 0x%" PRIx32, model_pte, program.error()); ++ } ++ ++ ET_LOG(Info,"Model buffer loaded, has %lu methods", program->num_methods()); ++ ++ const char* method_name = nullptr; ++ { ++ const auto method_name_result = program->get_method_name(0); ++ ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); ++ method_name = *method_name_result; ++ } ++ ET_LOG(Info,"Running method %s", method_name); ++ ++ Result method_meta = program->method_meta(method_name); ++ if (!method_meta.ok()) { ++ ET_LOG(Info,"Failed to get method_meta for %s: 0x%x", ++ method_name, (unsigned int)method_meta.error()); ++ } ++ ++ torch::executor::MemoryAllocator method_allocator{ ++ torch::executor::MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)}; ++ ++ std::vector> planned_buffers; // Owns the memory ++ std::vector> planned_spans; // Passed to the allocator ++ size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers(); ++ ++ for (size_t id = 0; id < num_memory_planned_buffers; ++id) { ++ size_t buffer_size = static_cast(method_meta->memory_planned_buffer_size(id).get()); ++ ET_LOG(Info,"Setting up planned buffer %zu, size %zu.", id, buffer_size); ++ ++ planned_buffers.push_back(std::make_unique(buffer_size)); ++ planned_spans.push_back({planned_buffers.back().get(), buffer_size}); ++ } ++ ++ torch::executor::HierarchicalAllocator planned_memory( ++ {planned_spans.data(), planned_spans.size()}); ++ ++ torch::executor::MemoryManager memory_manager(&method_allocator, &planned_memory); ++ ++ Result method = program->load_method(method_name, &memory_manager); ++ if(!method.ok()) { ++ ET_LOG(Info,"Loading of method %s failed with status 0x%" PRIx32, method_name, method.error()); ++ } ++ ET_LOG(Info,"Method loaded."); ++ ++ ET_LOG(Info,"Preparing inputs..."); ++ auto inputs = torch::executor::util::PrepareInputTensors(*method); ++ ET_LOG(Info,"Input prepared."); ++ ++ ET_LOG(Info,"Starting the model execution..."); ++ Error status = method->execute(); ++ if(status != Error::Ok){ ++ ET_LOG(Info,"Execution of method %s failed with status 0x%" PRIx32, method_name, status); ++ } else { ++ ET_LOG(Info,"Model executed successfully."); ++ } ++ ++ std::vector outputs(method->outputs_size()); ++ ET_LOG(Info, "%zu outputs: ", outputs.size()); ++ status = method->get_outputs(outputs.data(), outputs.size()); ++ ET_CHECK(status == Error::Ok); ++ for (int i = 0; i < outputs.size(); ++i) { ++ for (int j = 0; j < outputs[i].toTensor().numel(); ++j) { ++ printf("Output[%d][%d]: %f\n", i, j, outputs[i].toTensor().const_data_ptr()[j]); ++ } ++ } ++ return 0; ++} +diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake +index a21d9f0..036f189 100644 +--- a/cmake/helpers.cmake ++++ b/cmake/helpers.cmake +@@ -85,7 +85,7 @@ endfunction() + ############################################################################# + + function(ethosu_add_executable target) +- cmake_parse_arguments(ARGS "" "TARGET_LIBRARY" "SOURCES;LIBRARIES" ${ARGN}) ++ cmake_parse_arguments(ARGS "WHOLE_ARCHIVE" "TARGET_LIBRARY" "SOURCES;LIBRARIES" ${ARGN}) + add_executable(${target}) + + target_sources(${target} PRIVATE +@@ -95,8 +95,17 @@ function(ethosu_add_executable target) + set(ARGS_TARGET_LIBRARY ethosu_target_init) + endif() + ++ if (ARGS_WHOLE_ARCHIVE) ++ set(PRE_LINKER_FLAGS "-Wl,--whole-archive") ++ set(POST_LINKER_FLAGS "-Wl,--no-whole-archive") ++ endif() 
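++    # --whole-archive/--no-whole-archive keep otherwise-unreferenced objects from being dropped by the linker; ExecuTorch registers kernels and backends via static initialisers, so those objects must be force-linked.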
++ + target_link_libraries(${target} PRIVATE +- ${ARGS_TARGET_LIBRARY} ${ARGS_LIBRARIES}) ++ ${PRE_LINKER_FLAGS} ++ ${ARGS_TARGET_LIBRARY} ++ ${ARGS_LIBRARIES} ++ ${POST_LINKER_FLAGS} ++ ) + + ethosu_eval_link_options(${target}) + +-- +2.39.3 + diff --git a/examples/backend/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch b/examples/backend/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch new file mode 100644 index 00000000000..c1270961510 --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/core_platform/patches/0007-Add-delegate-runner-test.patch @@ -0,0 +1,300 @@ +From 0fe8caba3068da05021232912c069124a81e0d94 Mon Sep 17 00:00:00 2001 +From: Rob Elliott +Date: Wed, 4 Oct 2023 13:31:33 +0000 +Subject: [PATCH] Add delegate runner test + +Signed-off-by: Rob Elliott +--- + applications/executorch_tests/CMakeLists.txt | 27 ++- + .../executorch_tests/pte_to_header.py | 11 +- + .../executorch_tests/runner_delegate.cpp | 160 ++++++++++++++++++ + cmake/toolchain/arm-none-eabi-gcc.cmake | 6 +- + 4 files changed, 195 insertions(+), 9 deletions(-) + create mode 100644 applications/executorch_tests/runner_delegate.cpp + +diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt +index c95d53e..835f824 100644 +--- a/applications/executorch_tests/CMakeLists.txt ++++ b/applications/executorch_tests/CMakeLists.txt +@@ -28,20 +28,24 @@ set(ET_DIR_PATH "<..>/executorch" CACHE PATH "Path to ExecuTorch dir") + set(ET_BUILD_DIR_PATH "${ET_DIR_PATH}/cmake-out" CACHE PATH "Path to ExecuTorch build dir") + set(ET_INCLUDE_PATH "${ET_DIR_PATH}/.." CACHE PATH "Path to ExecuTorch headers") + set(ET_PTE_FILE_PATH "${ET_PTE_FILE_PATH}" CACHE PATH "Path to ExecuTorch model pte") ++set(ET_PTE_DELEGATE_FILE_PATH "${ET_PTE_DELGATE__FILE_PATH}" CACHE PATH "Path to ExecuTorch delegate model pte") + + get_filename_component(ET_BUILD_DIR_PATH ${ET_BUILD_DIR_PATH} REALPATH) + get_filename_component(ET_DIR_PATH ${ET_DIR_PATH} REALPATH) + get_filename_component(ET_INCLUDE_PATH ${ET_INCLUDE_PATH} REALPATH) + get_filename_component(ET_PTE_FILE_PATH ${ET_PTE_FILE_PATH} REALPATH) ++get_filename_component(ET_PTE_DELEGATE_FILE_PATH ${ET_PTE_DELEGATE_FILE_PATH} REALPATH) + + message("**********************") + message("ExecuTorch dir (ET_DIR_PATH) : ${ET_DIR_PATH}") + message("ExecuTorch build dir(ET_BUILD_DIR_PATH) : ${ET_BUILD_DIR_PATH}") + message("ExecuTorch headers (ET_INCUDE_PATH) : ${ET_INCLUDE_PATH}") + message("ExecuTorch pte file (ET_PTE_FILE_PATH) : ${ET_PTE_FILE_PATH}") ++message("ExecuTorch pte delegate file (ET_PTE_DELEGATE_FILE_PATH) : ${ET_PTE_DELEGATE_FILE_PATH}") + message("**********************") + + set(LIB_ET_RUNTIME "${ET_BUILD_DIR_PATH}/libexecutorch.a") ++set(LIB_ET_ETHOS "${ET_BUILD_DIR_PATH}/backends/arm/libexecutorch_delegate_ethos_u.a") + set(LIB_ET_OP_REGISTRATION "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_ops_lib.a") + set(LIB_ET_OP_KERNELS "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_kernels.a") + +@@ -54,8 +58,11 @@ add_custom_command( + OUTPUT + ${CMAKE_CURRENT_BINARY_DIR}/fake_dep + ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h ++ ${CMAKE_CURRENT_BINARY_DIR}/model_delegate_pte.h + COMMAND ${PYTHON_EXECUTABLE} ./pte_to_header.py --pte ${ET_PTE_FILE_PATH} +- --out ${CMAKE_CURRENT_BINARY_DIR} ++ --outdir ${CMAKE_CURRENT_BINARY_DIR} ++ COMMAND ${PYTHON_EXECUTABLE} ./pte_to_header.py --pte ${ET_PTE_DELEGATE_FILE_PATH} ++ --outdir ${CMAKE_CURRENT_BINARY_DIR} --outfile model_delegate_pte.h + 
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + +@@ -67,10 +74,24 @@ ethosu_add_executable_test(executor_runner PRIVATE + ${LIB_ET_OP_REGISTRATION} + ${LIB_ET_OP_KERNELS}) + +-add_dependencies(executor_runner gen_model_header) +- + target_include_directories(executor_runner PRIVATE + ${ET_INCLUDE_PATH} + ${CMAKE_CURRENT_BINARY_DIR}) + ++ethosu_add_executable_test(executor_runner_delegate PRIVATE ++ WHOLE_ARCHIVE TRUE ++ SOURCES runner_delegate.cpp ++ LIBRARIES ++ ${LIB_ET_RUNTIME} ++ ${LIB_ET_ETHOS} ++ ) ++ ++target_include_directories(executor_runner_delegate PRIVATE ++${ET_INCLUDE_PATH} ++${CMAKE_CURRENT_BINARY_DIR}) ++ ++add_dependencies(executor_runner gen_model_header) ++ ++ ++ + # TODO Memory setup +diff --git a/applications/executorch_tests/pte_to_header.py b/applications/executorch_tests/pte_to_header.py +index 37d88aa..be3282d 100644 +--- a/applications/executorch_tests/pte_to_header.py ++++ b/applications/executorch_tests/pte_to_header.py +@@ -30,11 +30,18 @@ parser.add_argument( + ) + parser.add_argument( + "--outdir", +- help="Output dir for model_pte.h", ++ help="Output dir for model header", + type=str, + required=False, + default=".", + ) ++parser.add_argument( ++ "--outfile", ++ help="Output filename for model header", ++ type=str, ++ required=False, ++ default="model_pte.h", ++) + parser.add_argument( + "--section", + help="Section attribute for the data array", +@@ -43,7 +50,7 @@ parser.add_argument( + default=".sram.data", + ) + args = parser.parse_args() +-outfile = os.path.join(args.outdir, "model_pte.h") ++outfile = os.path.join(args.outdir, args.outfile) + attr = f'__attribute__((section("{args.section}"), aligned(16))) char ' + + with open(args.pte, "rb") as fr, open( +diff --git a/applications/executorch_tests/runner_delegate.cpp b/applications/executorch_tests/runner_delegate.cpp +new file mode 100644 +index 0000000..ff40084 +--- /dev/null ++++ b/applications/executorch_tests/runner_delegate.cpp +@@ -0,0 +1,160 @@ ++/* ++ * SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates ++ * ++ * SPDX-License-Identifier: Apache-2.0 ++ * ++ * Licensed under the Apache License, Version 2.0 (the License); you may ++ * not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an AS IS BASIS, WITHOUT ++ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++/**************************************************************************** ++ * Includes ++ ****************************************************************************/ ++ ++#include ++#include ++#include ++ ++using namespace std; ++ ++#include ++#include ++#include ++#include ++#include ++ ++/**************************************************************************** ++ * Data ++ ****************************************************************************/ ++ ++// Our .pte file generated from the AoT flow ++#include "model_delegate_pte.h" // contains model_pte ++ ++// Storage for intermediate data in SRAM ++__attribute__((section(".sram.data"), aligned(16))) uint8_t method_allocator_pool[4 * 1024U]; ++ ++void et_pal_init(void) {} ++ ++__ET_NORETURN void et_pal_abort(void) { ++ __builtin_trap(); ++} ++ ++et_timestamp_t et_pal_current_ticks(void) { ++ // libc.a - warning: _gettimeofday is not implemented and will always fail ++ return 11223344; ++} ++ ++/** ++ * Emit a log message via platform output (serial port, console, etc). ++ */ ++void et_pal_emit_log_message( ++ __ET_UNUSED et_timestamp_t timestamp, ++ et_pal_log_level_t level, ++ const char* filename, ++ __ET_UNUSED const char* function, ++ size_t line, ++ const char* message, ++ __ET_UNUSED size_t length) { ++ fprintf( ++ stderr, ++ "%c executorch:%s:%zu] %s\n", ++ level, ++ filename, ++ line, ++ message); ++} ++ ++int main() ++{ ++ ET_LOG(Info, "Initialising runtime"); ++ torch::executor::runtime_init(); ++ ++ using torch::executor::Result; ++ using torch::executor::Error; ++ ++ // Load pte from the global model_pte .pte file loaded into SRAM. ++ auto loader = torch::executor::util::BufferDataLoader(model_pte, sizeof(model_pte)); ++ Result program = torch::executor::Program::load(&loader); ++ if(!program.ok()) { ++ ET_LOG(Info, "Program loading failed @ 0x%p: 0x%x", model_pte, (int)program.error()); ++ } ++ ET_LOG(Info, "Model buffer loaded, has %u methods", program->num_methods()); ++ ++ // Find our entrypoint in the .pte program ++ const char* method_name = nullptr; ++ const auto method_name_result = program->get_method_name(0); ++ ET_CHECK_MSG(method_name_result.ok(), "Program has no methods"); ++ method_name = *method_name_result; ++ ET_LOG(Info, "Found (and will run) method '%s'", method_name); ++ ++ // Allocate necessary memories for this method ++ Result method_meta = program->method_meta(method_name); ++ if (!method_meta.ok()) { ++ ET_LOG(Info, "Failed to get method_meta for %s: 0x%x", ++ method_name, (unsigned int)method_meta.error()); ++ } ++ ++ torch::executor::MemoryAllocator method_allocator{ ++ torch::executor::MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)}; ++ ++ std::vector> planned_buffers; // Owns the memory ++ std::vector> planned_spans; // Passed to the allocator ++ size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers(); ++ ++ for (size_t id = 0; id < num_memory_planned_buffers; ++id) { ++ size_t buffer_size = static_cast(method_meta->memory_planned_buffer_size(id).get()); ++ ET_LOG(Info, "Setting up planned buffer %zu, size %zu.", id, buffer_size); ++ ++ planned_buffers.push_back(std::make_unique(buffer_size)); ++ planned_spans.push_back({planned_buffers.back().get(), buffer_size}); ++ } ++ ++ torch::executor::HierarchicalAllocator planned_memory( ++ {planned_spans.data(), planned_spans.size()}); ++ ++ torch::executor::MemoryManager memory_manager(&method_allocator, &planned_memory); ++ ++ Result method = program->load_method(method_name, 
&memory_manager); ++ ++ if(!method.ok()) { ++ ET_LOG(Info, "Loading of method %s failed with status 0x%x", method_name, (int)method.error()); ++ } ++ ET_LOG(Info, "Loading of method '%s' successful", method_name); ++ ++ auto inputs = torch::executor::util::PrepareInputTensors(*method); ++ ++ ET_LOG(Info, "Starting the model execution..."); ++ Error status = method->execute(); ++ if(status != Error::Ok){ ++ ET_LOG(Info, "Execution of method %s failed with status 0x%x", method_name, (int)status); ++ } else { ++ ET_LOG(Info, "Model executed successfully."); ++ } ++ ++ // Print the outputs. ++ std::vector<torch::executor::EValue> outputs(method->outputs_size()); ++ ET_LOG(Info, "%zu outputs - ", outputs.size()); ++ status = method->get_outputs(outputs.data(), outputs.size()); ++ ET_CHECK(status == Error::Ok); ++ for (size_t i = 0; i < outputs.size(); ++i) ++ { ++ ET_LOG(Info, "Output %zu numel %zd", i, outputs[i].toTensor().numel()); ++ for (size_t j = 0; j < outputs[i].toTensor().numel(); ++j) ++ { ++ ET_LOG(Info, " Output[%zu]: %d", j, outputs[i].toTensor().const_data_ptr<int>()[j]); ++ } ++ } ++ ++ return 0; ++} ++ ++ +diff --git a/cmake/toolchain/arm-none-eabi-gcc.cmake b/cmake/toolchain/arm-none-eabi-gcc.cmake +index 0e6a2ed..fdb0d7c 100644 +--- a/cmake/toolchain/arm-none-eabi-gcc.cmake ++++ b/cmake/toolchain/arm-none-eabi-gcc.cmake +@@ -98,8 +98,6 @@ add_compile_options( + # -Wswitch + # -Wswitch-default + # -Wunused +- +- # -Wno-redundant-decls +- +- # -Wno-psabi ++ -Wno-redundant-decls ++ -Wno-psabi + ) +-- +2.41.0 + diff --git a/examples/backend/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch b/examples/backend/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch new file mode 100644 index 00000000000..e131ca76ee8 --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch @@ -0,0 +1,129 @@ +From ef07230fbb15edbf27ecaf48994fb157430a5e7c Mon Sep 17 00:00:00 2001 +From: Rob Elliott +Date: Thu, 5 Oct 2023 16:45:42 +0000 +Subject: [PATCH] Improve rescale codegen for TOSA + +Signed-off-by: Rob Elliott +--- + ethosu/vela/tosa_graph_optimiser.py | 56 +++++++++++------------------ + ethosu/vela/tosa_mapping.py | 2 +- + 2 files changed, 22 insertions(+), 36 deletions(-) + +diff --git a/ethosu/vela/tosa_graph_optimiser.py b/ethosu/vela/tosa_graph_optimiser.py +index df6b575..b2e3697 100644 +--- a/ethosu/vela/tosa_graph_optimiser.py ++++ b/ethosu/vela/tosa_graph_optimiser.py +@@ -337,7 +337,8 @@ def rewrite_concat(op): + + def remove_memory_ops(op, arch): + if op.run_on_npu and op.type in (Op.Reshape, Op.Identity): +- bypass_memory_only_ops(op) ++ # TODO: is this ok - function doesn't use arch or nng ++ bypass_memory_only_ops(op, arch, None) + + + def rewrite_activation(op, arch, nng): +@@ -357,7 +358,6 @@ def rewrite_activation(op, arch, nng): + + return op + +- + def rewrite_rescale(op, arch, nng): + if op.type == Op.Rescale: + ifm = op.ifm +@@ -368,7 +368,7 @@ def rewrite_rescale(op, arch, nng): + prev_op = ifm.ops[0] + + # TODO currently not supported +- assert len(ifm.consumer_list) == 1 ++ #assert len(ifm.consumer_list) == 1 + + input_zp = op.attrs["input_zp"] + output_zp = op.attrs["output_zp"] +@@ -390,6 +390,9 @@ def rewrite_rescale(op, arch, nng): + assert False + ifm.quantization.zero_point = input_zp + ofm.quantization.zero_point = output_zp ++ ++ assert False == per_channel, "Don't like per_channel!"
++ + for s, m in zip(shift, multiplier): + # TODO these are the TOSA limitations + assert m >= 0 +@@ -403,45 +406,28 @@ def rewrite_rescale(op, arch, nng): + else: + rounding_mode = RoundingMode.HalfUp + +- if prev_op.type.is_depthwise_conv2d_op() or prev_op.type.is_conv2d_op() or prev_op.type == Op.FullyConnected: ++ fuse = len(ifm.ops) == 1 and prev_op.type.is_depthwise_conv2d_op() or prev_op.type.is_conv2d_op() ++ if fuse: ++ # TODO: ERROR: bias.values didn't exist for an op like Add - presumably not a capability of that op + assert len(multiplier) == len(shift) == len(prev_op.bias.values) +- +- if ifm.dtype == DataType.int32 and per_channel: +- prev_op.explicit_scaling = explicit_scaling +- prev_op.rounding_mode = rounding_mode +- +- # Bypass op +- prev_op.set_output_tensor(ofm) +- DebugDatabase.add_optimised(op, prev_op) +- return op +- else: +- print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type) +- assert False +- # TODO which are the cases we need to and can do standalone Rescale? +- # TODO should we try to identify a conversion uint8<->int8 accomplished by 2 RESCALE ops? +- # origin might be TFLite op QUANTIZE, should we look to see if they can be translated to QUANTIZE? +- # limited to these at the moment: +- elif ( +- (ifm.dtype == DataType.int8 and ofm.dtype == DataType.int8) +- or (ifm.dtype == DataType.uint8 and ofm.dtype == DataType.int8) +- or (ifm.dtype == DataType.int8 and ofm.dtype == DataType.uint8) +- ): +- # Create NOP performing the RESCALE ++ # TODO: generate replacement fusion code from below ++ assert False, "Fusion possible but i've not implemented it" ++ else: ++ # Generate Rescale behaviour attached to a compatible NOP ++ # TODO: I assume this attaches a new operator into the graph?? + avgpool_op = replace_rescale_with_avg_pool(op) + avgpool_op.rounding_mode = rounding_mode +- ++ + if per_channel: +- # TODO +- avgpool_op.explicit_scaling = explicit_scaling +- print("Warning, unsupported TOSA Rescale") +- assert False ++ assert False, "Assert above removed but still not implemented... :/" + else: + avgpool_op.explicit_scaling = explicit_scaling +- else: +- print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type) +- assert False +- return op + ++ #print( len(multiplier), len(shift), len(prev_op.get_bias_tensors()) ) ++ #print( ifm.dtype, "PC:", per_channel, op.type ) ++ #print( ifm.dtype, ofm.dtype ) ++ ++ return op + + def convert_pad_in_width(op): + """ +diff --git a/ethosu/vela/tosa_mapping.py b/ethosu/vela/tosa_mapping.py +index 2dafd81..ed5aa2e 100644 +--- a/ethosu/vela/tosa_mapping.py ++++ b/ethosu/vela/tosa_mapping.py +@@ -148,7 +148,7 @@ transpose_conv_attrs = AttrSerializer( + ) + transpose_attrs = AttrSerializer("TransposeAttribute", (("perms", is_vec),)) + axis_attrs = AttrSerializer("AxisAttribute", ("axis",)) +-reshape_attrs = AttrSerializer("ReshapeAttribute", (("shape", is_vec),)) ++reshape_attrs = AttrSerializer("ReshapeAttribute", (("newShape", is_vec),)) + slice_attrs = AttrSerializer("SliceAttribute", (("start", is_vec), ("size", is_vec))) + tile_attrs = AttrSerializer("TileAttribute", (("multiplies", is_vec),)) + resize_attrs = AttrSerializer( +-- +2.41.0 + diff --git a/examples/backend/arm/ethos-u-setup/setup.sh b/examples/backend/arm/ethos-u-setup/setup.sh new file mode 100755 index 00000000000..f82d3333e7f --- /dev/null +++ b/examples/backend/arm/ethos-u-setup/setup.sh @@ -0,0 +1,213 @@ +#!/usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. 
and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -eu + + +######## +### Helper functions +######## +function get_os_name() { + # Returns the name of the system i.e. Linux or Darwin + uname -s +} + +function get_cpu_arch() { + # Returns the cpu architecture like arm64 or x86-64 + uname -m +} + +######## +### Hardcoded constants +######## +script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) + +if [[ $(get_cpu_arch) == "x86_64" ]]; then + # FVP + fvp_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64.tgz?rev=018659bd574f4e7b95fa647e7836ccf4&hash=22A79103C6FA5FFA7AFF3BE0447F3FF9" + fvp_model_dir="Linux64_GCC-9.3" + + # toolchain + toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi.tar.xz" + toolchain_dir="arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi" +elif [[ $(get_cpu_arch) == "aarch64" ]]; then + # FVP + fvp_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64_armv8l.tgz?rev=9cc6e9a32bb947ca9b21fa162144cb01&hash=7657A4CF27D42E892E3F08D452AAB073" + fvp_model_dir="Linux64_armv8l_GCC-9.3" + + # toolchain + toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi.tar.xz" + toolchain_dir="arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi" +else + echo "[main] Error: only x86-64 & aarch64 architectures are supported for now!"; exit 1; +fi + +# ethos-u +ethos_u_repo_url="https://review.mlplatform.org/ml/ethos-u/ethos-u" +ethos_u_base_rev="0995223100e3da8011700f58e491f1bf59511e3c" + +######## +### Optional user args +######## +root_dir=${1:-"$(readlink -f ${script_dir}/../ethos-u)"} + +######## +### Functions +######## +function setup_fvp() { + # Download and install the Corstone 300 FVP simulator platform + cd "${root_dir}" + if [[ ! -e FVP_cs300.tgz ]]; then + echo "[${FUNCNAME[0]}] Downloading FVP ..." + curl --output FVP_cs300.tgz "${fvp_url}" + fi + + echo "[${FUNCNAME[0]}] Installing FVP ..." + rm -rf FVP + mkdir -p FVP + cd FVP + tar xf ../FVP_cs300.tgz + ./FVP_Corstone_SSE-300.sh --i-agree-to-the-contained-eula --force --destination ./ --quiet --no-interactive + + fvp_bin_path="$(cd models/${fvp_model_dir} && pwd)" + export PATH=${PATH}:${fvp_bin_path} + + hash FVP_Corstone_SSE-300_Ethos-U55 + echo "export PATH=\${PATH}:${fvp_bin_path}" | tee -a ${update_path_script} +} + +function setup_toolchain() { + # Download and install the arm-none-eabi toolchain + cd "${root_dir}" + if [[ ! -e gcc.tar.xz ]]; then + echo "[${FUNCNAME[0]}] Downloading toolchain ..." + curl --output gcc.tar.xz "${toolchain_url}" + echo "Done" + fi + + echo "[${FUNCNAME[0]}] Installing toolchain ..." + rm -rf "${toolchain_dir}" + tar xf gcc.tar.xz + toolchain_bin_path="$(cd ${toolchain_dir}/bin && pwd)" + export PATH=${PATH}:${toolchain_bin_path} + hash arm-none-eabi-gcc + echo "export PATH=\${PATH}:${toolchain_bin_path}" >> ${update_path_script} +} + +function setup_ethos_u() { + # This is the main dir which will pull more repos to do baremetal software dev for cs300 + echo "[${FUNCNAME[0]}] Setting up the repo" + cd "${root_dir}" + [[ !
-d ethos-u ]] && \ + git clone ${ethos_u_repo_url} + cd ethos-u + git reset --hard ${ethos_u_base_rev} + ./fetch_externals.py fetch + pip install pyelftools + echo "[${FUNCNAME[0]}] Done @ $(git describe --all --long 2> /dev/null) in ${root_dir}/ethos-u dir." +} + +function patch_repo() { + # This is a temporary hack until it finds a better home in one of the Arm ML repos + echo -e "[${FUNCNAME[0]}] Preparing ${name}..." + local repo_dir="${root_dir}/ethos-u/${name}" + cd $repo_dir + + git reset --hard ${base_rev} + + patch_dir=${script_dir}/${name}/patches/ + [[ -e ${patch_dir} && $(ls -A ${patch_dir}) ]] && \ + git am -3 ${patch_dir}/*.patch + + echo -e "[${FUNCNAME[0]}] Patched ${name} @ $(git describe --all --long 2> /dev/null) in ${repo_dir} dir.\n" +} + +function setup_tosa_reference_model() { + # The debug flow on the host includes running on a reference implementation of TOSA + # This is useful primarily for debug of quantization accuracy, but also for internal + # errors in the early codebase + cd "${root_dir}" + if [[ ! -e reference_model ]]; then + git clone https://git.mlplatform.org/tosa/reference_model.git -b v0.80.0 + cd reference_model + git submodule update --init --recursive + cd .. + fi + cd reference_model + mkdir -p build + cd build + cmake .. + make + cd reference_model + tosa_bin_path=`pwd` + echo adding ${tosa_bin_path} to path + echo "export PATH=\${PATH}:${tosa_bin_path}" >> ${update_path_script} + cd ../.. + echo back at `pwd` +} + +function setup_vela() { + # + # Prepare the Vela compiler for AoT to Ethos-U compilation + # + cd "${root_dir}/ethos-u/" + if [[ ! -e ethos-u-vela ]]; then + git clone https://git.mlplatform.org/ml/ethos-u/ethos-u-vela.git + name="ethos-u-vela" + base_rev=00a15db3e1a188b25065d095152d701f4394cdc5 + patch_repo + fi + pip install . + cd .. +} + +######## +### main +######## + +cd "${script_dir}" + +# Make sure we are on a supported platform +# Linux ARM64 is a supported platform - adding it here is a WIP +# No OSx support for FVP +[[ $(get_cpu_arch) != "x86_64" ]] && [[ $(get_cpu_arch) != "aarch64" ]] \ + && { echo "[main] Error: only x86-64 & aarch64 architectures are supported for now!"; exit 1; } + +[[ $(get_os_name) != "Linux" ]] \ + && { echo "[main] Error: only Linux OS is supported for now!"; exit 1; } + +# Setup the root dir +mkdir -p "${root_dir}" +cd "${root_dir}" +echo "[main] Using root dir ${root_dir}" + +update_path_script="${root_dir}/setup_path.sh" +echo "" > "${update_path_script}" + +# Setup FVP +setup_fvp + +# Setup toolchain +setup_toolchain + +# Setup the ethos-u dev environment +setup_ethos_u + +# Patch the ethos-u dev environment to include executorch application +name="core_platform" +base_rev=204210b1074071532627da9dc69950d058a809f4 +patch_repo + +# Setup the tosa_reference_model +setup_tosa_reference_model + +# Setup vela and patch in codegen fixes +setup_vela + +echo "[main] update path using script: ${update_path_script}" +echo "[main] success!" +exit $? diff --git a/examples/backend/arm/run.sh b/examples/backend/arm/run.sh new file mode 100755 index 00000000000..aa19cebca43 --- /dev/null +++ b/examples/backend/arm/run.sh @@ -0,0 +1,158 @@ +#!/usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree.
+ +set -eu + +######## +### Hardcoded constants +######## +script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +et_root_dir=$(readlink -f ${script_dir}/../../../) +et_build_dir=${et_root_dir}/cmake-out +ethos_u_root_dir=$(readlink -f ${script_dir}/ethos-u/ethos-u) +ethos_u_build_dir=${ethos_u_root_dir}/core_platform/build +fvp_model=FVP_Corstone_SSE-300_Ethos-U55 +toolchain_cmake=${ethos_u_root_dir}/core_platform/cmake/toolchain/arm-none-eabi-gcc.cmake +toolchain_cmake_executorch=${et_root_dir}/backends/arm/cmake/arm-none-eabi-gcc.cmake +_setup_msg="please refer to ${script_dir}/ethos-u-setup/setup.sh to properly install necessary tools." + + +# Generate eager mode results +# TODO + +# Generate the PTE file +function generate_pte_file() { + cd $et_root_dir + python3 -m examples.export.export_example --model_name="softmax" + local pte_file=$(readlink -f ./softmax.pte) + [[ -f ${pte_file} ]] || { echo "Failed to generate a pte file - ${pte_file}"; exit 1; } + echo "${pte_file}" +} + +# Generate the ethos delegate PTE file +function generate_ethos_pte_file() { + cd $et_root_dir + python3 examples/backend/arm/arm_ethosu_minimal.py &> /dev/null + cd ./ethosout/simple_add/torch/ + local pte_file=$(readlink -f ./delegated.pte) + [[ -f ${pte_file} ]] || { echo "Failed to generate a pte file - ${pte_file}"; exit 1; } + echo "${pte_file}" +} + +# build ExecuTorch Libraries +function build_executorch() { + rm -rf "${et_build_dir}" + mkdir "${et_build_dir}" + cd "${et_build_dir}" + cmake \ + -DBUCK2=/tmp/buck2 \ + -DFLATC_EXECUTABLE="$(which flatc)" \ + -DEXECUTORCH_BUILD_HOST_TARGETS=OFF \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_GFLAGS=OFF \ + -DCMAKE_SYSTEM_PROCESSOR=cortex-m55+nodsp+nofp \ + -DETHOSU_TARGET_NPU_CONFIG=ethos-u55-128 \ + -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake_executorch}" \ + -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_ENABLE_LOGGING_RELEASE_MODE=ON \ + -DSELECT_OPS_LIST="aten::_softmax.out" \ + "${et_root_dir}" + + echo "[${FUNCNAME[0]}] Configured CMAKE" + + n=$(nproc) + cmake --build . -j"$((n - 5))" -- VERBOSE=1 + echo "[${FUNCNAME[0]}] Generated static libraries for ExecuTorch:" + find . -name "*.a" -exec ls -al {} \; +} + +# build Arm Baremetal executor_runner +function build_executorch_runner() { + [[ $# -ne 2 ]] && { echo "[${FUNCNAME[0]}] Expecting 2 pte files as arguments, got: $@"; exit 1; } + local pte=${1} + local pte_delegate=${2} + cd "${ethos_u_root_dir}"/core_platform + cmake \ + -DCMAKE_TOOLCHAIN_FILE=${toolchain_cmake_executorch} \ + -B build targets/corstone-300 \ + -DET_DIR_PATH:PATH=${et_root_dir} \ + -DET_BUILD_DIR_PATH:PATH=${et_build_dir} \ + -DET_PTE_FILE_PATH:PATH="${pte}" \ + -DET_PTE_DELEGATE_FILE_PATH:PATH="${pte_delegate}" \ + -DPYTHON_EXECUTABLE=$(which python3) + echo "[${FUNCNAME[0]}] Configured CMAKE" + + n=$(nproc) + cmake --build build -- -j"$((n - 5))" executor_runner executor_runner_delegate #VERBOSE=1 + echo "[${FUNCNAME[0]}] Generated baremetal elf file:" + find . -name "executor_runner.elf" +} + +# Execute the executor_runner on FVP Simulator +function run_fvp() { + elf=$(find ${ethos_u_build_dir} -name "executor_runner.elf") + [[ !
-f $elf ]] && { echo "[${FUNCNAME[0]}]: Unable to find executor_runner elf: ${elf}"; exit 1; } + FVP_Corstone_SSE-300_Ethos-U55 \ + -C ethosu.num_macs=128 \ + -C mps3_board.visualisation.disable-visualisation=1 \ + -C mps3_board.telnetterminal0.start_telnet=0 \ + -C mps3_board.uart0.out_file='-' \ + -a "${elf}" \ + --timelimit 5 || true + echo "[${FUNCNAME[0]}] Simulation complete, $?" +} + +# Execute the executor_runner_delegate on FVP Simulator +function run_fvp_delegate() { + elf=$(find ${ethos_u_build_dir} -name "executor_runner_delegate.elf") + [[ ! -f $elf ]] && { echo "[${FUNCNAME[0]}]: Unable to find executor_runner_delegate elf: ${elf}"; exit 1; } + FVP_Corstone_SSE-300_Ethos-U55 \ + -C ethosu.num_macs=128 \ + -C mps3_board.visualisation.disable-visualisation=1 \ + -C mps3_board.telnetterminal0.start_telnet=0 \ + -C mps3_board.uart0.out_file='-' \ + -a "${elf}" \ + --timelimit 5 || true + echo "[${FUNCNAME[0]}] Simulation complete, $?" +} + +####### +### Main +####### + +# basic checks before we get started +hash ${fvp_model} \ + || { echo "Could not find ${fvp_model} on PATH, ${_setup_msg}"; exit 1; } + +hash arm-none-eabi-gcc \ + || { echo "Could not find arm baremetal toolchain on PATH, ${_setup_msg}"; exit 1; } + +[[ -f ${toolchain_cmake} ]] \ + || { echo "Could not find ${toolchain_cmake} file, ${_setup_msg}"; exit 1; } + +[[ -f ${et_root_dir}/CMakeLists.txt ]] \ + || { echo "Executorch repo doesn't contain CMakeLists.txt file at root level"; exit 1; } + +# get the pte +pte=$(generate_pte_file) +pte_delegate=$(generate_ethos_pte_file) + +# build et +build_executorch + +# build the et baremetal app +build_executorch_runner "${pte}" "${pte_delegate}" + +# run the app +run_fvp + +# run the delegate app +run_fvp_delegate + +exit $? diff --git a/examples/models/__init__.py b/examples/models/__init__.py index 48544bd94bf..745c5a4c05d 100644 --- a/examples/models/__init__.py +++ b/examples/models/__init__.py @@ -11,6 +11,7 @@ "linear": ("toy_model", "LinearModule"), "add": ("toy_model", "AddModule"), "add_mul": ("toy_model", "AddMulModule"), + "softmax": ("toy_model", "SoftmaxModule"), "dl3": ("deeplab_v3", "DeepLabV3ResNet50Model"), "edsr": ("edsr", "EdsrModel"), "emformer_transcribe": ("emformer_rnnt", "EmformerRnntTranscriberModel"), diff --git a/examples/models/toy_model/__init__.py b/examples/models/toy_model/__init__.py index f0400df8301..0f77b325105 100644 --- a/examples/models/toy_model/__init__.py +++ b/examples/models/toy_model/__init__.py @@ -4,11 +4,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree.
-from .model import AddModule, AddMulModule, LinearModule, MulModule +from .model import AddModule, AddMulModule, LinearModule, MulModule, SoftmaxModule __all__ = [ AddModule, AddMulModule, LinearModule, MulModule, + SoftmaxModule, ] diff --git a/examples/models/toy_model/model.py b/examples/models/toy_model/model.py index 0f7131fe21c..72ef27e188f 100644 --- a/examples/models/toy_model/model.py +++ b/examples/models/toy_model/model.py @@ -75,3 +75,19 @@ def get_example_inputs(self): def get_compile_spec(self): max_value = self.get_example_inputs()[0].shape[0] return [CompileSpec("max_value", bytes([max_value]))] + +class SoftmaxModule(torch.nn.Module, EagerModelBase): + def __init__(self): + super().__init__() + self.softmax = torch.nn.Softmax() + + def forward(self, x): + z = self.softmax(x) + return z + + def get_eager_model(self) -> torch.nn.Module: + return self + + def get_example_inputs(self): + return (torch.ones(2, 2),) + diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index a0f1712b4e3..271b1cf087a 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -37,9 +37,19 @@ file(GLOB_RECURSE _portable_kernels__srcs "${CMAKE_CURRENT_SOURCE_DIR}/cpu/*.cpp") list(FILTER _portable_kernels__srcs EXCLUDE REGEX "test/*.cpp") list(FILTER _portable_kernels__srcs EXCLUDE REGEX "codegen") -# Generate C++ bindings to register kernels into both PyTorch (for AOT) and -# Executorch (for runtime). Here select all ops in functions.yaml -gen_selected_ops("${CMAKE_CURRENT_LIST_DIR}/functions.yaml" "" "") + +# If a filter list is provided, only generate wrappers for those ops. +# Else generate wrappers for all ops - this is the default behavior. +if(SELECT_OPS_LIST) + message("Selecting only ${SELECT_OPS_LIST} op(s)!") + gen_selected_ops("" "${SELECT_OPS_LIST}" "") +else() + # Generate C++ bindings to register kernels into both PyTorch (for AOT) and + # Executorch (for runtime). Here select all ops in functions.yaml + message("Selecting all ops") + gen_selected_ops("${CMAKE_CURRENT_LIST_DIR}/functions.yaml" "" "") +endif() + # Expect gen_selected_ops output file to be selected_operators.yaml generate_bindings_for_kernels(${CMAKE_CURRENT_SOURCE_DIR}/functions.yaml "") message("Generated files ${gen_command_sources}")
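The pieces above are tied together by the two new scripts. A minimal usage sketch follows, assuming the commands are run from the ExecuTorch repo root, that setup.sh is used with its default install location, and that the generated setup_path.sh is sourced so the FVP and toolchain are on PATH:

    # One-time environment setup: Corstone-300 FVP, arm-none-eabi toolchain,
    # the ethos-u repos (patched with the executorch application) and Vela.
    examples/backend/arm/ethos-u-setup/setup.sh

    # Pick up the FVP and toolchain paths recorded by setup.sh.
    source examples/backend/arm/ethos-u/setup_path.sh

    # Export softmax.pte and the Ethos-U delegated.pte, build ExecuTorch with
    # SELECT_OPS_LIST="aten::_softmax.out", build executor_runner and
    # executor_runner_delegate, and run both on the Corstone-300 FVP.
    examples/backend/arm/run.sh

run.sh builds two runners on purpose: executor_runner exercises the selectively built portable softmax kernel, while executor_runner_delegate runs the delegated.pte produced by arm_ethosu_minimal.py through the Ethos-U delegate.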