diff --git a/.gitmodules b/.gitmodules
index 980a999eff0..3138391f7c0 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -31,3 +31,9 @@
 [submodule "backends/arm/third-party/serialization_lib"]
 	path = backends/arm/third-party/serialization_lib
 	url = https://git.mlplatform.org/tosa/serialization_lib.git
+[submodule "backends/arm/third-party/ethos-u-core-driver"]
+	path = backends/arm/third-party/ethos-u-core-driver
+	url = https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git
+[submodule "backends/arm/third-party/cmsis"]
+	path = backends/arm/third-party/cmsis
+	url = https://github.com/ARM-software/CMSIS_5.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2cc626e0d5d..1678bc2d8a3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -59,8 +59,18 @@ endif()
 # - targets in the current directory, before and after this command is invoked
 # - targets in sub-directories added after this command is invoked
 if(CMAKE_BUILD_TYPE STREQUAL "Release")
+  # To enable logging in Release mode
+  option(
+    EXECUTORCH_ENABLE_LOGGING_RELEASE_MODE
+    "Enable logging in release mode" OFF)
+
+  set(_ET_LOG_ENABLE 0)
+  if (${EXECUTORCH_ENABLE_LOGGING_RELEASE_MODE})
+    set(_ET_LOG_ENABLE 1)
+  endif()
+
   # Avoid pulling in the logging strings, which can be large.
-  add_definitions(-DET_LOG_ENABLED=0)
+  add_definitions(-DET_LOG_ENABLED=${_ET_LOG_ENABLE})
   # Avoid pulling in the flatbuffer data verification
   # logic, which can add about 20kB.
   add_definitions(-DET_ENABLE_PROGRAM_VERIFICATION=0)
@@ -106,6 +116,10 @@ if(BUILD_SELECTIVE_BUILD_TEST)
   option(SELECT_OPS_YAML "Register all the ops from a given yaml file" OFF)
 endif()
 
+# Build Arm Baremetal backend
+option(EXECUTORCH_BUILD_ARM_BAREMETAL
+  "Build the Arm Baremetal flow for Cortex-M and Ethos-U" OFF)
+
 # Build xnn_executor_runner which depends on XNNPACK
 option(EXECUTORCH_BUILD_XNNPACK
   "Build xnn_executor_runner which depends on XNNPACK" OFF)
@@ -295,6 +309,10 @@ if(EXECUTORCH_BUILD_XNNPACK)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack)
 endif()
 
+if(EXECUTORCH_BUILD_ARM_BAREMETAL)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm)
+endif()
+
 # Add selective build subdirectory
 if(BUILD_SELECTIVE_BUILD_TEST)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/examples/selective_build)
diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt
new file mode 100644
index 00000000000..2cc5cf94740
--- /dev/null
+++ b/backends/arm/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Copyright 2023 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+cmake_minimum_required(VERSION 3.19)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+endif()
+
+include(${EXECUTORCH_ROOT}/build/Utils.cmake)
+
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
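+# The parent of EXECUTORCH_ROOT is used above so that ExecuTorch headers
+# resolve as <executorch/...> includes.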
+set(_common_compile_options -Wno-deprecated-declarations)
+
+include(cmake/Dependencies.cmake)
+
+set(_arm_baremetal_sources backends/arm/runtime/ArmBackendEthosU.cpp)
+list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/")
+add_library(ethos_u STATIC ${_arm_baremetal_sources})
+target_include_directories(ethos_u PUBLIC ${_common_include_directories})
+target_include_directories(ethos_u PUBLIC ${DRIVER_ETHOSU_INCLUDE_DIR})
diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
index 1a6499cf07d..82b24f4b9b6 100644
--- a/backends/arm/arm_backend.py
+++ b/backends/arm/arm_backend.py
@@ -13,6 +13,7 @@
 import operator
 import os
 import tempfile
+import subprocess
 
 from typing import final, List
 
 import numpy as np
@@ -140,6 +141,64 @@ def dbg_tosa_dump(tosa_fb, path):
     f.write(js)
     f.close()
 
+# Output to Vela with current file-based compilation
+# WARNING: if this changes, the runtime reader also needs to change
+def vela_compile(tosa_fb):
+    with tempfile.TemporaryDirectory() as tmpdir:
+        print(f"compiling to Vela in {tmpdir}")
+
+        tosaname = "out.tosa"
+        flatbuffer = tosa_fb.serialize()
+        f = open(os.path.join(tmpdir,tosaname), "wb")
+        f.write(flatbuffer)
+        f.close()
+
+        # invoke vela
+        # TODO target ethos-u55-128
+        vela_command = f"cd {tmpdir}; vela --accelerator-config ethos-u55-128 {tosaname}"
+        subprocess.run([vela_command], shell=True, check=True)
+
+        np_path = os.path.join(tmpdir,"output","out_sg0_vela.npz")
+        blocks = b''
+        with np.load(np_path, allow_pickle=False) as data:
+            # Emit the NPZ regions as:
+            #  - 16 byte block name null terminated string (padded to 16 if name shorter)
+            #  - 4 bytes of int32 block length and 12 bytes of 0's
+            #  - block data (padded to 16 byte alignment at end)
+            # Repeat for all blocks
+            for key in data.keys():
+                block_name = bytes(key,"utf8")[:15]
+                block_name = block_name + b'\x00'*(16-len(block_name))
+                block_data = data[key].tobytes()
+                # We need the actual unpadded block lengths for hw setup
+                block_length = len(block_data).to_bytes(16, 'little')
+                # pad block data to multiple of 16 bytes
+                block_data = block_data + b'\x00'*(15-(len(block_data)-1)%16)
+
+                block = block_name + block_length + block_data
+                blocks = blocks + block
+
+            # Add a block for scratch, inputs and outputs
+            # scratch shape is a 1 element array giving us size in bytes
+            block_name = bytes("scratch_data","utf8")[:15]
+            block_name = block_name + b'\x00'*(16-len(block_name))
+            block_length = data["scratch_shape"][0].item()
+            print(f"scratch length = {block_length}")
+            block_length = block_length+(15-(block_length-1)%16)
+            block_data = b'\x00'*block_length
+            block_length = block_length.to_bytes(16, 'little')
+            print(f"lengths {len(block_name)} {len(block_length)} {len(block_data)}")
+            block = block_name + block_length + block_data
+            blocks = blocks + block
+            # TODO are these already in scratch shape? They look to be:
+            #input_shape * input_elem_size
+            #output_shape * output_elem_size
+            # input_offset and output_offset specify the location these arrays are written from base of scratch
+
+        # return 16 byte VELA bin header + blocks + footer
+        header = bytes("vela_bin_stream","utf-8") + b'\x00'
+        footer = bytes("vela_end_stream","utf-8") + b'\x00'
+        return header + blocks + footer
 
 def dbg_fail(node, tosa_fb, path):
     dbg_tosa_dump(tosa_fb, path)
@@ -205,10 +264,6 @@ def preprocess(  # noqa: C901
             path = spec.value.decode()
             debug_output = True
 
-    # in non debug builds we still pass files to vela
-    if path is None:
-        path = tempfile.mkdtemp(prefix="arm_tosa_")
-
     # Converted output for this subgraph, serializer needs path early as it emits
     # const data directly. Path created and data written only in debug builds.
     tosa_fb = ts.TosaSerializer(path)
@@ -680,5 +735,7 @@ def preprocess(  # noqa: C901
         dbg_tosa_dump(tosa_fb, path)
 
     # Serialize and return the tosa flatbuffer
-    fb = tosa_fb.serialize()
-    return PreprocessResult(processed_bytes=bytes(fb))
+    # fb = bytes(tosa_fb.serialize())
+    binary = vela_compile(tosa_fb)
+
+    return PreprocessResult(processed_bytes=binary)
diff --git a/backends/arm/cmake/Dependencies.cmake b/backends/arm/cmake/Dependencies.cmake
new file mode 100644
index 00000000000..27a587176bb
--- /dev/null
+++ b/backends/arm/cmake/Dependencies.cmake
@@ -0,0 +1,12 @@
+# Copyright 2023 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
+
+# Ethos-U driver
+set(DRIVER_ETHOSU_SOURCE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver")
+set(DRIVER_ETHOSU_INCLUDE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include")
+add_subdirectory( ${DRIVER_ETHOSU_SOURCE_DIR} )
+include_directories( ${DRIVER_ETHOSU_INCLUDE_DIR} )
diff --git a/backends/arm/cmake/arm-none-eabi-gcc.cmake b/backends/arm/cmake/arm-none-eabi-gcc.cmake
new file mode 100644
index 00000000000..d70f79361cd
--- /dev/null
+++ b/backends/arm/cmake/arm-none-eabi-gcc.cmake
@@ -0,0 +1,90 @@
+# Copyright 2023 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
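+#
+# Toolchain file for bare-metal Cortex-M cross compilation; passed to CMake
+# via --toolchain as in backends/arm/cmake/build.sh. TARGET_CPU is a cache
+# variable and can be overridden on the command line.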
+
+set(TARGET_CPU "cortex-m4" CACHE STRING "Target CPU")
+string(TOLOWER ${TARGET_CPU} CMAKE_SYSTEM_PROCESSOR)
+
+set(CMAKE_SYSTEM_NAME Generic)
+set(CMAKE_C_COMPILER "arm-none-eabi-gcc")
+set(CMAKE_CXX_COMPILER "arm-none-eabi-g++")
+set(CMAKE_ASM_COMPILER "arm-none-eabi-gcc")
+set(CMAKE_LINKER "arm-none-eabi-ld")
+
+set(CMAKE_EXECUTABLE_SUFFIX ".elf")
+set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+# Select C/C++ version
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_CXX_STANDARD 14)
+
+set(GCC_CPU ${CMAKE_SYSTEM_PROCESSOR})
+string(REPLACE "cortex-m85" "cortex-m55" GCC_CPU ${GCC_CPU})
+
+# Compile options
+add_compile_options(
+    -mcpu=${GCC_CPU}
+    -mthumb
+    "$<$<CONFIG:DEBUG>:-gdwarf-3>"
+    "$<$<COMPILE_LANGUAGE:CXX>:-fno-unwind-tables;-fno-rtti;-fno-exceptions>"
+    -fdata-sections
+    -ffunction-sections)
+
+# Compile defines
+add_compile_definitions(
+    "$<$<NOT:$<CONFIG:DEBUG>>:NDEBUG>")
+
+# Link options
+add_link_options(
+    -mcpu=${GCC_CPU}
+    -mthumb
+    --specs=nosys.specs)
+
+# Set floating point unit
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+fp")
+    set(FLOAT hard)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "\\+nofp")
+    set(FLOAT soft)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m33(\\+|$)" OR
+       CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m55(\\+|$)" OR
+       CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m85(\\+|$)")
+    set(FLOAT hard)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m4(\\+|$)" OR
+       CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m7(\\+|$)")
+    set(FLOAT hard)
+    set(FPU_CONFIG "fpv4-sp-d16")
+    add_compile_options(-mfpu=${FPU_CONFIG})
+    add_link_options(-mfpu=${FPU_CONFIG})
+else()
+    set(FLOAT soft)
+endif()
+
+if (FLOAT)
+    add_compile_options(-mfloat-abi=${FLOAT})
+    add_link_options(-mfloat-abi=${FLOAT})
+endif()
+
+add_link_options(LINKER:--nmagic,--gc-sections)
+
+# Compilation warnings
+add_compile_options(
+#    -Wall
+#    -Wextra
+
+#    -Wcast-align
+#    -Wdouble-promotion
+#    -Wformat
+#    -Wmissing-field-initializers
+#    -Wnull-dereference
+#    -Wredundant-decls
+#    -Wshadow
+#    -Wswitch
+#    -Wswitch-default
+#    -Wunused
+    -Wno-redundant-decls
+    -Wno-psabi
+)
diff --git a/backends/arm/cmake/build.sh b/backends/arm/cmake/build.sh
new file mode 100755
index 00000000000..0dbb8cf2177
--- /dev/null
+++ b/backends/arm/cmake/build.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# Copyright 2023 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+set -e
+
+#
+# Setup toolchain
+#
+BASEDIR=`realpath $(dirname "$0")`
+echo "building using build.sh in $BASEDIR"
+
+ARCH=$(uname -i)
+GCCPATH=${BASEDIR}/arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi/bin/
+
+echo $GCCPATH
+if test -d "${GCCPATH}"; then
+    echo Using existing compiler ${GCCPATH}
+else
+    pushd ${BASEDIR}/
+    ./toolchain.sh
+    popd
+fi
+export PATH=${PATH}:${GCCPATH}
+
+echo building with `arm-none-eabi-gcc -v 2>&1 | grep "^gcc"`
+
+
+#
+# Prepare and run clean build
+#
+rm -rf buck-out/ build/lib/ cmake-out/
+rm -rf cmake-corstone
+mkdir cmake-corstone
+cd cmake-corstone
+
+#cmake -DBUCK2=buck2 ..
+
+#cmake --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake ..
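+
+# Configure the bare-metal cross build. This assumes a host-built flatc is
+# already on the PATH, since the cross-compiled tree cannot run its own tools.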
+cmake -DFLATC_EXECUTABLE=flatc \
+    -DEXECUTORCH_BUILD_XNNPACK=OFF \
+    -DEXECUTORCH_BUILD_HOST_TARGETS=OFF \
+    -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \
+    -DCMAKE_SYSTEM_PROCESSOR=cortex-m55+nodsp+nofp \
+    -DETHOSU_TARGET_NPU_CONFIG=ethos-u55-128 \
+    --toolchain backends/arm/cmake/arm-none-eabi-gcc.cmake \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DEXECUTORCH_ENABLE_LOGGING_RELEASE_MODE=ON \
+    ..
+
+cd ..
+cmake --build cmake-corstone -j9 --target ethos_u ethosu_core_driver executorch portable_ops_lib portable_kernels
diff --git a/backends/arm/cmake/toolchain.sh b/backends/arm/cmake/toolchain.sh
new file mode 100755
index 00000000000..92188ee982d
--- /dev/null
+++ b/backends/arm/cmake/toolchain.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Copyright 2023 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+set -e
+
+# Cross compiler for Arm baremetal (e.g. Corstone-300 FVP or silicon)
+ARCH=$(uname -i)
+curl -o gcc.tar.xz https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi.tar.xz
+tar xf gcc.tar.xz
+export PATH=${PATH}:`(cd arm-gnu-toolchain-12.3.rel1-${ARCH}-arm-none-eabi/bin/; pwd)`
diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp
new file mode 100644
index 00000000000..3dc52645089
--- /dev/null
+++ b/backends/arm/runtime/ArmBackendEthosU.cpp
@@ -0,0 +1,261 @@
+/*
+ * Copyright 2023 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/*
+ * Arm backend for the Ethos-U baremetal driver stack; it relies on the
+ * ethos-u-core-driver for hardware interaction.
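+ *
+ * The 'processed' buffer it consumes is the vela_bin_stream produced by
+ * vela_compile() in backends/arm/arm_backend.py: a 16 byte header string
+ * "vela_bin_stream\0", a sequence of blocks (16 byte name, 16 byte
+ * little-endian length of which only the low 4 bytes are significant, then
+ * data padded to a multiple of 16 bytes) and a 16 byte footer string
+ * "vela_end_stream\0". vela_read() below walks this layout.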
+ */
+
+#include <cstring>
+
+#include <executorch/runtime/backend/interface.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+
+#include <ethosu_driver.h>
+#include <pmu_ethosu.h>
+
+namespace torch {
+namespace executor {
+
+// TODO we should be in 0x31, not this lower 1MB sRAM
+// SRAM (rwx) : ORIGIN = 0x31000000, LENGTH = 0x00200000
+#define CS300_SRAM_LOW ((void*)0x11000000)
+#define CS300_SRAM_HIGH ((void*)0x110FFFFF)
+
+class ArmBackend final : public PyTorchBackendInterface {
+
+public:
+  ArmBackend() {
+    ET_LOG(Debug, "Constructing ARM Backend");
+  }
+
+  ~ArmBackend() = default;
+
+  virtual bool is_available() const override {
+    return true;
+  }
+
+  Result<DelegateHandle*> init(
+      BackendInitContext& context,
+      FreeableBuffer* processed,
+      ArrayRef<CompileSpec> compile_specs) const override {
+
+    ET_LOG(Info, "ArmBackend::init %p", processed->data() );
+
+    char *data = (char*)processed->data();
+    size_t size = processed->size();
+    char *foot = data + size - 16;
+
+    // Header and footer both 16 byte aligned suggest valid structure and we
+    // won't walk off the end of the chunks and segfault
+    if( !((int)data == next_mul_16((int)data)) )
+    {
+      ET_LOG(Error, "ArmBackend::init header unaligned");
+      return Error::InvalidProgram;
+    }
+    if( !((int)foot == next_mul_16((int)foot)) )
+    {
+      ET_LOG(Error, "ArmBackend::init footer unaligned");
+      return Error::InvalidProgram;
+    }
+    if( !(0 == strncmp( data, "vela_bin_stream", 15 )) )
+    {
+      ET_LOG(Error, "ArmBackend::init header magic invalid");
+      return Error::InvalidProgram;
+    }
+    if( !(0 == strncmp( foot, "vela_end_stream", 15 )) )
+    {
+      ET_LOG(Error, "ArmBackend::init footer magic invalid");
+      return Error::InvalidProgram;
+    }
+    // Verify address range is accessible current expectation is the program
+    // is wholly stored in SRAM
+    if( !(data > CS300_SRAM_LOW && foot < CS300_SRAM_HIGH) )
+    {
+      ET_LOG(Error, "ArmBackend::init program not wholly in SRAM");
+      return Error::InvalidProgram;
+    }
+
+    // Return the same buffer we were passed - this data will be
+    // executed directly
+    return processed;
+  }
+
+  Error execute(
+      BackendExecutionContext& context,
+      DelegateHandle* input_handle,
+      EValue** args) const override {
+
+    FreeableBuffer* processed = (FreeableBuffer*)input_handle;
+
+    ET_LOG(Info, "ArmBackend::execute %p", processed->data() );
+
+    vela_handles handles = { 0, 0, 0, 0, 0, 0 };
+
+    // Command stream - we know at this point it's aligned
+    char *data = (char*)processed->data();
+
+    // Read key sections from the vela_bin_stream
+    if( !this->vela_read( data, &handles, processed->size() ) )
+    {
+      ET_LOG(Error, "ArmBackend::vela_read: error, invalid binary layout" );
+      return Error::InvalidProgram;
+    }
+
+    ET_LOG(Debug, "ArmBackend::execute: Running program data:\n  cmd %p %d\n  weight %p %d\n  scratch %p %d\n",
+        handles.cmd_data, handles.cmd_data_size,
+        handles.weight_data, handles.weight_data_size,
+        handles.scratch_data, handles.scratch_data_size );
+
+    // TMP emit scratch
+    printf("Scratch before:\n");
+    for( int i=0; i<handles.scratch_data_size; i++ )
+    {
+      printf("%02x ", ((unsigned char*)handles.scratch_data)[i] );
+    }
+    printf("\n");
+
+    // Write the input into the scratch area at the offset Vela assigned for
+    // the network input; assumes a single input at args[0] and a single
+    // output at args[1], as in the add.pte example
+    char *input_address = (char*)handles.scratch_data + handles.input_offset;
+    auto tensor_in = args[0]->toTensor();
+    for(int j=0; j<tensor_in.nbytes(); j++)
+    {
+      input_address[j] = tensor_in.mutable_data_ptr<char>()[j];
+    }
+
+    // Synchronously invoke the NPU: base address 0 is the constant weight
+    // stream, base address 1 is the scratch area holding activations,
+    // input and output
+    uint64_t bases[2] = { (uint64_t)handles.weight_data, (uint64_t)handles.scratch_data };
+    size_t bases_size[2] = { handles.weight_data_size, handles.scratch_data_size };
+    struct ethosu_driver *drv = ethosu_reserve_driver();
+    int result = ethosu_invoke_v3(
+        drv,
+        (void*)handles.cmd_data, handles.cmd_data_size,
+        bases, bases_size, 2,
+        nullptr );
+    ethosu_release_driver(drv);
+    if( result != 0 )
+    {
+      ET_LOG(Error, "ArmBackend::execute: Ethos-U invocation failed, error %d", result );
+      return Error::InvalidProgram;
+    }
+
+    // Copy the network output from scratch back into the output EValue
+    char *output_address = (char*)handles.scratch_data + handles.output_offset;
+    auto tensor_out = args[1]->toTensor();
+    for(int j=0; j<tensor_out.nbytes(); j++)
+    {
+      tensor_out.mutable_data_ptr<char>()[j] = output_address[j];
+    }
+
+    return Error::Ok;
+  }
+
+  void destroy(DelegateHandle* handle) const override {
+    return;
+  }
+
+private:
+  typedef struct {
+    const char *cmd_data; size_t cmd_data_size;
+    const char *weight_data; size_t weight_data_size;
+    const char *scratch_data; size_t scratch_data_size;
+    size_t input_offset; size_t input_data_shape[3];
+    size_t output_offset; size_t output_data_shape[3];
+  } vela_handles;
+
+  typedef struct {
+    char name[16];
+    int size; char _pad[12];
+    char data[];
+  } vela_bin_block;
+
+  static int next_mul_16( int n ) {
+    return ((n-1)|15)+1;
+  }
+
+  int vela_read(char* data, vela_handles *h, int size ) const {
+
+    // Read header string
+    if( strncmp( data, "vela_bin_stream", 15 ) )
+    {
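+      // Missing the vela_bin_stream magic - not a valid vela binary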
+      return 0;
+    }
+    data += 16;
+
+    // Expect one or more 'vela_bin_block's
+    while( 1 )
+    {
+      vela_bin_block *b = (vela_bin_block*)data;
+      data += 16 + 16 + next_mul_16(b->size);
+
+      // Exit with success on finding end of stream
+      if( !strncmp( b->name, "vela_end_stream", 15 ) ) return 1;
+
+      if( !strncmp( b->name, "cmd_data", strlen("cmd_data")) )
+      {
+        // This magic header confirms a valid command stream in binary
+        if( strncmp( b->data, "COP1", 4 ) ) return 0;
+        h->cmd_data = b->data;
+        h->cmd_data_size = b->size;
+      }
+      if( !strncmp( b->name, "weight_data", strlen("weight_data")) )
+      {
+        h->weight_data = b->data;
+        h->weight_data_size = b->size;
+      }
+      if( !strncmp( b->name, "scratch_data", strlen("scratch_data")) )
+      {
+        h->scratch_data = b->data;
+        h->scratch_data_size = b->size;
+      }
+
+      // capture inputs and outputs
+      if( !strncmp( b->name, "input_offset", strlen("input_offset")) )
+      {
+        h->input_offset = ((int*)b->data)[0];
+      }
+      if( !strncmp( b->name, "output_offset", strlen("output_offset")) )
+      {
+        h->output_offset = ((int*)b->data)[0];
+      }
+      if( !strncmp( b->name, "input_shape", strlen("input_shape")) )
+      {
+        h->input_data_shape[0] = ((int*)b->data)[0];
+        h->input_data_shape[1] = ((int*)b->data)[1];
+        h->input_data_shape[2] = ((int*)b->data)[2];
+      }
+      if( !strncmp( b->name, "output_shape", strlen("output_shape")) )
+      {
+        h->output_data_shape[0] = ((int*)b->data)[0];
+        h->output_data_shape[1] = ((int*)b->data)[1];
+        h->output_data_shape[2] = ((int*)b->data)[2];
+      }
+    }
+  }
+
+};
+
+auto backend = ArmBackend();
+void arm_backend_register() {
+  Backend backend_id{"ArmBackend", &backend};
+  static auto registered = register_backend(backend_id);
+}
+
+} // namespace executor
+} // namespace torch
diff --git a/backends/arm/third-party/cmsis b/backends/arm/third-party/cmsis
new file mode 160000
index 00000000000..a75f01746df
--- /dev/null
+++ b/backends/arm/third-party/cmsis
@@ -0,0 +1 @@
+Subproject commit a75f01746df18bb5b929dfb8dc6c9407fac3a0f3
diff --git a/backends/arm/third-party/ethos-u-core-driver b/backends/arm/third-party/ethos-u-core-driver
new file mode 160000
index 00000000000..90f9df900ac
--- /dev/null
+++ b/backends/arm/third-party/ethos-u-core-driver
@@ -0,0 +1 @@
+Subproject commit 90f9df900acdc0718ecd2dfdc53780664758dec5
diff --git a/codegen/templates/RegisterCodegenUnboxedKernels.cpp b/codegen/templates/RegisterCodegenUnboxedKernels.cpp
index a7790be7fed..86938d065b8 100644
--- a/codegen/templates/RegisterCodegenUnboxedKernels.cpp
+++ b/codegen/templates/RegisterCodegenUnboxedKernels.cpp
@@ -11,6 +11,8 @@
 #include
 #include
 #include
 #include "${fn_header}" // Generated Function import headers
+#include
+
 // ${generated_comment}
 
 // NOTE [Sharded File]: This file is generated in a sharded fashion to speed up
@@ -24,8 +26,6 @@ using KernelArrayRef = ::torch::executor::ArrayRef<::torch::executor::Kernel>;
 namespace torch {
 namespace executor {
-namespace function {
-namespace {
 
 static Kernel kernels_to_register[] = {
     ${unboxed_kernels} // Generated kernels
@@ -39,8 +39,11 @@ static KernelArrayRef kernel_array_ref(
 // Return value not used. Keep the static variable assignment to register
 // kernels in static initialization time.
-static auto success_with_kernel_reg = register_kernels(kernel_array_ref); -} // namespace -} // namespace function +// static auto success_with_kernel_reg = register_kernels(kernel_array_ref); + +void manual_override() { + static auto success_with_kernel_reg = register_kernels(kernel_array_ref); +} + } // namespace executor } // namespace torch diff --git a/examples/arm/arm_tosa_e2e.py b/examples/arm/arm_tosa_e2e.py index e320ca0cf4e..a9e07bed4c9 100644 --- a/examples/arm/arm_tosa_e2e.py +++ b/examples/arm/arm_tosa_e2e.py @@ -153,7 +153,7 @@ def tosa_run_test(op, profile=TosaProfile.MI): # noqa: C901 # Temp systest mode for running all models against both inference profiles if __name__ == "__main__": for op in TestList: - tosa_run_test(op, profile=TosaProfile.MI) + tosa_run_test(op, profile=TosaProfile.BI) # TODO: haven't added the quantized lowerings for BI, comment out for now # for op in TestList: diff --git a/examples/arm/cs300/core_platform/patches/0001-HACK-regress-cmake-version-from-3.21-3.20.patch b/examples/arm/cs300/core_platform/patches/0001-HACK-regress-cmake-version-from-3.21-3.20.patch new file mode 100644 index 00000000000..efb02478229 --- /dev/null +++ b/examples/arm/cs300/core_platform/patches/0001-HACK-regress-cmake-version-from-3.21-3.20.patch @@ -0,0 +1,25 @@ +From a969839b90756b2458cb80ac5edb619e87210bea Mon Sep 17 00:00:00 2001 +From: Digant Desai +Date: Thu, 28 Sep 2023 18:05:03 -0700 +Subject: [PATCH 1/3] [HACK] regress cmake version from 3.21 --> 3.20 + +--- + targets/corstone-300/CMakeLists.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/targets/corstone-300/CMakeLists.txt b/targets/corstone-300/CMakeLists.txt +index 62205bb..7dda8a1 100644 +--- a/targets/corstone-300/CMakeLists.txt ++++ b/targets/corstone-300/CMakeLists.txt +@@ -42,7 +42,7 @@ set(MEMORY_ARENA "dram" CACHE STRING "Memory config for arena") + # Project + ############################################################################# + +-cmake_minimum_required(VERSION 3.21) ++cmake_minimum_required(VERSION 3.20) + + project(ethos-u-corstone-300 VERSION 0.0.1) + +-- +2.39.3 + diff --git a/examples/arm/cs300/core_platform/patches/0002-HACK-disable-warnings-to-reduce-verbosity.patch b/examples/arm/cs300/core_platform/patches/0002-HACK-disable-warnings-to-reduce-verbosity.patch new file mode 100644 index 00000000000..f2a6e17ccd8 --- /dev/null +++ b/examples/arm/cs300/core_platform/patches/0002-HACK-disable-warnings-to-reduce-verbosity.patch @@ -0,0 +1,52 @@ +From 3687c49c2ca85ca8a7d554b1206272870c565de3 Mon Sep 17 00:00:00 2001 +From: Digant Desai +Date: Thu, 28 Sep 2023 18:05:30 -0700 +Subject: [PATCH 2/3] [HACK] disable warnings to reduce verbosity + +--- + cmake/toolchain/arm-none-eabi-gcc.cmake | 28 ++++++++++++------------- + 1 file changed, 14 insertions(+), 14 deletions(-) + +diff --git a/cmake/toolchain/arm-none-eabi-gcc.cmake b/cmake/toolchain/arm-none-eabi-gcc.cmake +index 093005e..0e6a2ed 100644 +--- a/cmake/toolchain/arm-none-eabi-gcc.cmake ++++ b/cmake/toolchain/arm-none-eabi-gcc.cmake +@@ -85,21 +85,21 @@ add_link_options(LINKER:--nmagic,--gc-sections) + + # Compilation warnings + add_compile_options( +- -Wall +- -Wextra ++ # -Wall ++ # -Wextra + +- -Wcast-align +- -Wdouble-promotion +- -Wformat +- -Wmissing-field-initializers +- -Wnull-dereference +- -Wredundant-decls +- -Wshadow +- -Wswitch +- -Wswitch-default +- -Wunused ++ # -Wcast-align ++ # -Wdouble-promotion ++ # -Wformat ++ # -Wmissing-field-initializers ++ # -Wnull-dereference ++ # -Wredundant-decls ++ # 
-Wshadow ++ # -Wswitch ++ # -Wswitch-default ++ # -Wunused + +- -Wno-redundant-decls ++ # -Wno-redundant-decls + +- -Wno-psabi ++ # -Wno-psabi + ) +-- +2.39.3 + diff --git a/examples/arm/cs300/core_platform/patches/0003-HACK-Add-Executorch-add-example.patch b/examples/arm/cs300/core_platform/patches/0003-HACK-Add-Executorch-add-example.patch new file mode 100644 index 00000000000..9a0b0be554e --- /dev/null +++ b/examples/arm/cs300/core_platform/patches/0003-HACK-Add-Executorch-add-example.patch @@ -0,0 +1,224 @@ +From b5369c873814d765276a746ce26d2be5724da8f8 Mon Sep 17 00:00:00 2001 +From: Digant Desai +Date: Thu, 28 Sep 2023 19:07:51 -0700 +Subject: [PATCH 3/3] [HACK] Add Executorch add example + +--- + applications/CMakeLists.txt | 2 + + applications/executorch_tests/CMakeLists.txt | 53 ++++++++ + applications/executorch_tests/add.cpp | 130 +++++++++++++++++++ + 3 files changed, 185 insertions(+) + create mode 100644 applications/executorch_tests/CMakeLists.txt + create mode 100644 applications/executorch_tests/add.cpp + +diff --git a/applications/CMakeLists.txt b/applications/CMakeLists.txt +index 1fa2b2e..68e5427 100644 +--- a/applications/CMakeLists.txt ++++ b/applications/CMakeLists.txt +@@ -28,6 +28,8 @@ add_subdirectory(threadx_demo) + + add_subdirectory(message_handler_openamp) + ++add_subdirectory(executorch_tests) ++ + if (CMAKE_CXX_COMPILER_ID STREQUAL "ARMClang") + # Only armclang supported for now + add_subdirectory(trustzone_inference) +diff --git a/applications/executorch_tests/CMakeLists.txt b/applications/executorch_tests/CMakeLists.txt +new file mode 100644 +index 0000000..8a34c44 +--- /dev/null ++++ b/applications/executorch_tests/CMakeLists.txt +@@ -0,0 +1,53 @@ ++# ++# Copyright (c) 2021 Arm Limited. All rights reserved. ++# ++# SPDX-License-Identifier: Apache-2.0 ++# ++# Licensed under the Apache License, Version 2.0 (the License); you may ++# not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an AS IS BASIS, WITHOUT ++# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++# ++ ++if (NOT TARGET ethosu_core_driver) ++ return() ++endif() ++ ++#### ++#### Executorch demo app/test ++#### ++ ++set(ET_DIR_PATH "<..>/executorch" CACHE PATH "Path to Executorch dir") ++set(ET_BUILD_DIR_PATH "${ET_DIR_PATH}/cmake-out" CACHE PATH "Path to Executorch build dir") ++set(ET_INCLUDE_PATH "${ET_DIR_PATH}/.." 
CACHE PATH "Path to Executorch headers") ++ ++get_filename_component(ET_BUILD_DIR_PATH ${ET_BUILD_DIR_PATH} REALPATH) ++get_filename_component(ET_DIR_PATH ${ET_DIR_PATH} REALPATH) ++get_filename_component(ET_INCLUDE_PATH ${ET_INCLUDE_PATH} REALPATH) ++ ++message("**********************") ++message("Executorch dir (ET_DIR_PATH) : ${ET_DIR_PATH}") ++message("Executorch build dir(ET_BUILD_DIR_PATH): ${ET_BUILD_DIR_PATH}") ++message("Executorch headers (ET_INCUDE_PATH) : ${ET_INCLUDE_PATH}") ++message("**********************") ++ ++set(LIB_ET_RUNTIME "${ET_BUILD_DIR_PATH}/libexecutorch.a") ++set(LIB_ET_OP_REGISTRATION "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_ops_lib.a") ++set(LIB_ET_OP_KERNELS "${ET_BUILD_DIR_PATH}/kernels/portable/libportable_kernels.a") ++ ++ethosu_add_executable_test(executorch_add PRIVATE ++ SOURCES add.cpp ++ LIBRARIES ${LIB_ET_RUNTIME} ${LIB_ET_OP_REGISTRATION} ++ ${LIB_ET_OP_KERNELS}) ++ ++target_include_directories(executorch_add PRIVATE ++${ET_INCLUDE_PATH}) ++ ++# TODO Memory setup +diff --git a/applications/executorch_tests/add.cpp b/applications/executorch_tests/add.cpp +new file mode 100644 +index 0000000..115af66 +--- /dev/null ++++ b/applications/executorch_tests/add.cpp +@@ -0,0 +1,130 @@ ++/* ++ * SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates ++ * ++ * SPDX-License-Identifier: Apache-2.0 ++ * ++ * Licensed under the Apache License, Version 2.0 (the License); you may ++ * not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an AS IS BASIS, WITHOUT ++ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++/**************************************************************************** ++ * Includes ++ ****************************************************************************/ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++ ++#include ++#include ++ ++using namespace std; ++ ++__attribute__((section(".sram.data"), aligned(16))) uint8_t method_allocator_pool[4 * 1024U]; ++ ++/**************************************************************************** ++ * Functions ++ ****************************************************************************/ ++ ++int main() { ++ /* ++ * This is a simple Executorch app which runs `add.pte`. ++ */ ++ ++ torch::executor::runtime_init(); ++ ++ torch::executor::manual_override(); // Hack: This will be updated soon. 
++
++  using torch::executor::Result;
++  using torch::executor::Error;
++
++  auto loader = torch::executor::util::BufferDataLoader(add_pte, sizeof(add_pte));
++
++  Result<torch::executor::Program> program = torch::executor::Program::load(&loader);
++  if(!program.ok()) {
++    ET_LOG(Info,"ET: Program loading failed @ 0x%p: 0x%" PRIx32, add_pte, program.error());
++  }
++
++  ET_LOG(Info,"ET: Model buffer loaded, has %lu methods", program->num_methods());
++
++  const char* method_name = nullptr;
++  {
++    const auto method_name_result = program->get_method_name(0);
++    ET_CHECK_MSG(method_name_result.ok(), "Program has no methods");
++    method_name = *method_name_result;
++  }
++  ET_LOG(Info,"ET: Running method %s", method_name);
++
++  Result<torch::executor::MethodMeta> method_meta = program->method_meta(method_name);
++  if (!method_meta.ok()) {
++    ET_LOG(Info,"ET: Failed to get method_meta for %s: 0x%x",
++           method_name, (unsigned int)method_meta.error());
++  }
++
++  torch::executor::MemoryAllocator method_allocator{
++      torch::executor::MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)};
++
++  std::vector<std::unique_ptr<uint8_t[]>> planned_buffers; // Owns the memory
++  std::vector<torch::executor::Span<uint8_t>> planned_spans; // Passed to the allocator
++  size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers();
++
++  for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
++    size_t buffer_size = static_cast<size_t>(method_meta->memory_planned_buffer_size(id).get());
++    ET_LOG(Info,"ET: Setting up planned buffer %zu, size %zu.", id, buffer_size);
++
++    planned_buffers.push_back(std::make_unique<uint8_t[]>(buffer_size));
++    planned_spans.push_back({planned_buffers.back().get(), buffer_size});
++  }
++
++  torch::executor::HierarchicalAllocator planned_memory(
++      {planned_spans.data(), planned_spans.size()});
++
++  torch::executor::MemoryManager memory_manager(&method_allocator, &planned_memory);
++
++  Result<torch::executor::Method> method = program->load_method(method_name, &memory_manager);
++  if(!method.ok()) {
++    ET_LOG(Info,"ET: Loading of method %s failed with status 0x%" PRIx32, method_name, method.error());
++  }
++  ET_LOG(Info,"ET: Method loaded.");
++
++  ET_LOG(Info,"ET: Preparing inputs...");
++  auto inputs = torch::executor::util::PrepareInputTensors(*method);
++  ET_LOG(Info,"ET: Input prepared.");
++
++  ET_LOG(Info,"ET: Starting the model execution...");
++  Error status = method->execute();
++  if(status != Error::Ok){
++    ET_LOG(Info,"ET: Execution of method %s failed with status 0x%" PRIx32, method_name, status);
++  } else {
++    ET_LOG(Info,"ET: Model executed successfully.");
++  }
++
++  // Print the outputs.
++  std::vector<torch::executor::EValue> outputs(method->outputs_size());
++  ET_LOG(Info, "%zu outputs: ", outputs.size());
++  status = method->get_outputs(outputs.data(), outputs.size());
++  ET_CHECK(status == Error::Ok);
++  for (int i = 0; i < outputs.size(); ++i) {
++    for (int j = 0; j < outputs[i].toTensor().numel(); ++j) {
++      printf("Output[%d][%d]: %f\n", i, j, outputs[i].toTensor().const_data_ptr<float>()[j]);
++    }
++  }
++  return 0;
++}
+--
+2.39.3
+
diff --git a/examples/arm/cs300/setup.sh b/examples/arm/cs300/setup.sh
new file mode 100755
index 00000000000..63fbd36b3bc
--- /dev/null
+++ b/examples/arm/cs300/setup.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+set -eu
+
+ethos_u_dir=${1:-/tmp/ethos-u}
+script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+
+function patch_repo() {
+    echo -e "\nPreparing ${name}..."
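+    # Re-bases and patches ${ethos_u_dir}/${name}; ${name} and ${base_rev}
+    # are globals set by the callers below.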
+ cd ${ethos_u_dir}/${name} + + git reset --hard ${base_rev} + + patch_dir=${script_dir}/${name}/patches/ + [[ -e ${patch_dir} && $(ls -A ${patch_dir}) ]] && \ + git am -3 ${patch_dir}/*.patch + + echo -e "Patched ${name} @ $(git describe --all --long 2> /dev/null) in ${ethos_u_dir}/${name} dir.\n" +} + +name="core_platform" +base_rev=204210b1074071532627da9dc69950d058a809f4 +patch_repo + +name="core_software" +base_rev=74c514a5b50a19197a64a86095bc0429188adcbe +patch_repo + +exit $? diff --git a/examples/export/export_example.py b/examples/export/export_example.py index 9c2a9d9362e..e26d929aeac 100644 --- a/examples/export/export_example.py +++ b/examples/export/export_example.py @@ -12,6 +12,7 @@ from ..models import MODEL_NAME_TO_MODEL from ..models.model_factory import EagerModelFactory from .utils import export_to_exec_prog, save_pte_program +from executorch.exir.print_program import pretty_print, print_program # noqa FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" @@ -40,4 +41,6 @@ ) prog = export_to_exec_prog(model, example_inputs) + + pretty_print(prog.program.execution_plan) save_pte_program(prog.buffer, args.model_name) diff --git a/examples/models/toy_model/model.py b/examples/models/toy_model/model.py index 0f7131fe21c..1c8f9f3b590 100644 --- a/examples/models/toy_model/model.py +++ b/examples/models/toy_model/model.py @@ -45,9 +45,6 @@ def __init__(self): def forward(self, x, y): z = x + y - z = z + x - z = z + x - z = z + z return z def get_eager_model(self) -> torch.nn.Module: diff --git a/headrify.py b/headrify.py new file mode 100644 index 00000000000..cdae780c31c --- /dev/null +++ b/headrify.py @@ -0,0 +1,26 @@ +import binascii +bytes_per_line = 32 +hex_digits_per_line = bytes_per_line * 2 + +# copied from +# https://git.mlplatform.org/ml/ethos-u/ml-embedded-evaluation-kit.git/tree/scripts/py/gen_model_cpp.py + +magic_attr = '__attribute__((section(".sram.data"), aligned(16))) char' +# magic_attr = '__attribute__((section("network_model_sec"), aligned(16))) char' +# magic_attr = '__attribute__((section("input_data_sec"), aligned(16))) char' +filename="./add.pte" +with open(filename, "rb") as fr, open(f"{filename}.h", "w") as fw: + data = fr.read() + hexstream = binascii.hexlify(data).decode('utf-8') + + hexstring = magic_attr + ' add_pte[] = {' + + for i in range(0, len(hexstream), 2): + if 0 == (i % hex_digits_per_line): + hexstring += "\n" + hexstring += '0x' + hexstream[i:i+2] + ", " + + hexstring += '};\n' + fw.write(hexstring) + print(f"Wrote {len(hexstring)} bytes, original {len(data)}") + diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp index 1da9d0eaee5..1ec18b3775d 100644 --- a/kernels/portable/cpu/op_add.cpp +++ b/kernels/portable/cpu/op_add.cpp @@ -33,10 +33,15 @@ Tensor& add_out( ET_CHECK(canCast(common_type, out_type)); - ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "add", CTYPE_A, [&]() { - ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "add", CTYPE_B, [&]() { - ET_SWITCH_REAL_TYPES_AND(Bool, common_type, ctx, "add", CTYPE_IN, [&]() { - ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "add", CTYPE_OUT, [&]() { +// ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "add", CTYPE_A, [&]() { +// ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "add", CTYPE_B, [&]() { +// ET_SWITCH_REAL_TYPES_AND(Bool, common_type, ctx, "add", CTYPE_IN, [&]() { +// ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "add", CTYPE_OUT, [&]() { + + using CTYPE_A = float; + using CTYPE_B = float; + using CTYPE_IN = float; + using CTYPE_OUT = float; 
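+
+  // Types are hard-wired to float here, presumably to keep the full dtype
+  // dispatch tables out of this bare-metal bring-up build; the example
+  // add.pte only uses float tensors.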
CTYPE_IN alpha_val; ET_EXTRACT_SCALAR(alpha, alpha_val); @@ -51,10 +56,10 @@ Tensor& add_out( a, b, out); - }); - }); - }); - }); +// }); +// }); +// }); +// }); return out; } diff --git a/kernels/portable/cpu/vec_ops.h b/kernels/portable/cpu/vec_ops.h index 0373196a4b6..5a297026050 100644 --- a/kernels/portable/cpu/vec_ops.h +++ b/kernels/portable/cpu/vec_ops.h @@ -13,6 +13,7 @@ #include #include #include +#include /** * @file diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 949b771b9cc..6e31dbe4939 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -17,702 +17,7 @@ # See the README.md file in this directory for a description of the syntax used # by this file. -- op: _log_softmax.out - kernels: - - arg_meta: null - kernel_name: torch::executor::log_softmax_out - -- op: _native_batch_norm_legit_no_training.out - kernels: - - arg_meta: null - kernel_name: torch::executor::_native_batch_norm_legit_no_training_out - -- op: _softmax.out - kernels: - - arg_meta: null - kernel_name: torch::executor::softmax_out - -- op: _to_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::to_copy_out - -- op: abs.out - kernels: - - arg_meta: null - kernel_name: torch::executor::abs_out - -- op: acos.out - kernels: - - arg_meta: null - kernel_name: torch::executor::acos_out - -- op: acosh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::acosh_out - - op: add.out kernels: - arg_meta: null kernel_name: torch::executor::add_out - -- op: add.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::add_scalar_out - -- op: addmm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::addmm_out - -- op: alias_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::alias_copy_out - -- op: amax.out - kernels: - - arg_meta: null - kernel_name: torch::executor::amax_out - -- op: amin.out - kernels: - - arg_meta: null - kernel_name: torch::executor::amin_out - -- op: any.all_out - kernels: - - arg_meta: null - kernel_name: torch::executor::any_all_out - -- op: arange.out - kernels: - - arg_meta: null - kernel_name: torch::executor::arange_out - -- op: arange.start_out - kernels: - - arg_meta: null - kernel_name: torch::executor::arange_start_out - -- op: argmax.out - kernels: - - arg_meta: null - kernel_name: torch::executor::argmax_out - -- op: argmin.out - kernels: - - arg_meta: null - kernel_name: torch::executor::argmin_out - -- op: as_strided_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::as_strided_copy_out - -- op: asin.out - kernels: - - arg_meta: null - kernel_name: torch::executor::asin_out - -- op: asinh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::asinh_out - -- op: atan.out - kernels: - - arg_meta: null - kernel_name: torch::executor::atan_out - -- op: atanh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::atanh_out - -- op: avg_pool2d.out - kernels: - - arg_meta: null - kernel_name: torch::executor::avg_pool2d_out - -- op: bitwise_and.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::bitwise_and_Scalar_out - -- op: bitwise_and.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::bitwise_and_Tensor_out - -- op: bitwise_not.out - kernels: - - arg_meta: null - kernel_name: torch::executor::bitwise_not_out - -- op: bitwise_or.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::bitwise_or_Scalar_out - -- op: bitwise_or.Tensor_out - kernels: - - 
arg_meta: null - kernel_name: torch::executor::bitwise_or_Tensor_out - -- op: bitwise_xor.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::bitwise_xor_Scalar_out - -- op: bitwise_xor.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::bitwise_xor_Tensor_out - -- op: bmm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::bmm_out - -- op: cat.out - kernels: - - arg_meta: null - kernel_name: torch::executor::cat_out - -- op: ceil.out - kernels: - - arg_meta: null - kernel_name: torch::executor::ceil_out - -- op: clamp.out - cpp_no_default_args: ['min'] - kernels: - - arg_meta: null - kernel_name: torch::executor::clamp_out - -- op: clone.out - kernels: - - arg_meta: null - kernel_name: torch::executor::clone_out - -- op: constant_pad_nd.out - kernels: - - arg_meta: null - kernel_name: torch::executor::constant_pad_nd_out - -- op: convolution.out - kernels: - - arg_meta: null - kernel_name: torch::executor::convolution_out - -- op: copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::copy_out - -- op: cos.out - kernels: - - arg_meta: null - kernel_name: torch::executor::cos_out - -- op: cosh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::cosh_out - -- op: cumsum.out - kernels: - - arg_meta: null - kernel_name: torch::executor::cumsum_out - -- op: detach_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::detach_copy_out - -- op: div.out - kernels: - - arg_meta: null - kernel_name: torch::executor::div_out - -- op: div.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::div_scalar_out - -- op: div.out_mode - kernels: - - arg_meta: null - kernel_name: torch::executor::div_out_mode - - -- op: embedding.out - kernels: - - arg_meta: null - kernel_name: torch::executor::embedding_out - -- op: empty.out - kernels: - - arg_meta: null - kernel_name: torch::executor::empty_out - -- op: eq.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::eq_scalar_out - -- op: erf.out - kernels: - - arg_meta: null - kernel_name: torch::executor::erf_out - -- op: exp.out - kernels: - - arg_meta: null - kernel_name: torch::executor::exp_out - -- op: expand_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::expand_copy_out - -- op: fill.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::fill_scalar_out - -- op: fill.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::fill_tensor_out - -- op: floor.out - kernels: - - arg_meta: null - kernel_name: torch::executor::floor_out - -- op: floor_divide.out - kernels: - - arg_meta: null - kernel_name: torch::executor::floor_divide_out - -- op: fmod.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::fmod_Tensor_out - -- op: fmod.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::fmod_Scalar_out - -- op: full.out - kernels: - - arg_meta: null - kernel_name: torch::executor::full_out - -# TODO: Investigate why empty dispatch is required for building: -# buck2 build //executorch/kernels/portable:generated_lib -- op: full_like.out - dispatch: {} - kernels: - - arg_meta: null - kernel_name: torch::executor::full_like_out - -- op: ge.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::ge_scalar_out - -- op: ge.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::ge_tensor_out - -- op: gelu.out - kernels: - - arg_meta: null - kernel_name: torch::executor::gelu_out - -- op: glu.out - 
kernels: - - arg_meta: null - kernel_name: torch::executor::glu_out - -- op: gt.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::gt_scalar_out - -- op: gt.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::gt_tensor_out - -- op: hardtanh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::hardtanh_out - -- op: index.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::index_Tensor_out - -- op: index_put.out - kernels: - - arg_meta: null - kernel_name: torch::executor::index_put_out - -- op: index_select.out - kernels: - - arg_meta: null - kernel_name: torch::executor::index_select_out - -- op: isinf.out - kernels: - - arg_meta: null - kernel_name: torch::executor::isinf_out - -- op: isnan.out - kernels: - - arg_meta: null - kernel_name: torch::executor::isnan_out - -- op: le.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::le_scalar_out - -- op: le.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::le_tensor_out - -- op: leaky_relu.out - kernels: - - arg_meta: null - kernel_name: torch::executor::leaky_relu_out - -- op: lift_fresh_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::lift_fresh_copy_out - -- op: log.out - kernels: - - arg_meta: null - kernel_name: torch::executor::log_out - -- op: logical_and.out - kernels: - - arg_meta: null - kernel_name: torch::executor::logical_and_out - -- op: logical_not.out - kernels: - - arg_meta: null - kernel_name: torch::executor::logical_not_out - -- op: logical_or.out - kernels: - - arg_meta: null - kernel_name: torch::executor::logical_or_out - -- op: logical_xor.out - kernels: - - arg_meta: null - kernel_name: torch::executor::logical_xor_out - -- op: logit.out - kernels: - - arg_meta: null - kernel_name: torch::executor::logit_out - -- op: lt.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::lt_scalar_out - -- op: lt.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::lt_tensor_out - -- op: masked_fill.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::masked_fill_scalar_out - -- op: max.dim_max - kernels: - - arg_meta: null - kernel_name: torch::executor::max_out - -- op: max_pool2d_with_indices.out - kernels: - - arg_meta: null - kernel_name: torch::executor::max_pool2d_with_indices_out - -- op: mean.out - kernels: - - arg_meta: null - kernel_name: torch::executor::mean_dim_out - -- op: min.dim_min - kernels: - - arg_meta: null - kernel_name: torch::executor::min_out - -- op: minimum.out - kernels: - - arg_meta: null - kernel_name: torch::executor::minimum_out - -- op: mm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::mm_out - -- op: mul.out - kernels: - - arg_meta: null - kernel_name: torch::executor::mul_out - -- op: mul.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::mul_scalar_out - -- op: native_layer_norm.out - kernels: - - arg_meta: null - kernel_name: torch::executor::native_layer_norm_out - -- op: ne.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::ne_scalar_out - -- op: ne.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::ne_tensor_out - -- op: neg.out - kernels: - - arg_meta: null - kernel_name: torch::executor::neg_out - -- op: nonzero.out - kernels: - - arg_meta: null - kernel_name: torch::executor::nonzero_out - -- op: ones.out - kernels: - - arg_meta: null - kernel_name: torch::executor::ones_out - -- op: permute_copy.out - 
kernels: - - arg_meta: null - kernel_name: torch::executor::permute_copy_out - -- op: pixel_shuffle.out - kernels: - - arg_meta: null - kernel_name: torch::executor::pixel_shuffle_out - -- op: pow.Tensor_Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::pow_Tensor_Scalar_out - -- op: pow.Tensor_Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::pow_Tensor_Tensor_out - -- op: reciprocal.out - kernels: - - arg_meta: null - kernel_name: torch::executor::reciprocal_out - -- op: relu.out - kernels: - - arg_meta: null - kernel_name: torch::executor::relu_out - -- op: remainder.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::remainder_Tensor_out - -- op: remainder.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::remainder_Scalar_out - -- op: repeat.out - kernels: - - arg_meta: null - kernel_name: torch::executor::repeat_out - -- op: round.out - kernels: - - arg_meta: null - kernel_name: torch::executor::round_out - -- op: rsqrt.out - kernels: - - arg_meta: null - kernel_name: torch::executor::rsqrt_out - -- op: rsub.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::rsub_scalar_out - -- op: scalar_tensor.out - kernels: - - arg_meta: null - kernel_name: torch::executor::scalar_tensor_out - -- op: scatter_add.out - kernels: - - arg_meta: null - kernel_name: torch::executor::scatter_add_out - -- op: select_copy.int_out - kernels: - - arg_meta: null - kernel_name: torch::executor::select_copy_int_out - -- op: select_scatter.out - kernels: - - arg_meta: null - kernel_name: torch::executor::select_scatter_out - -- op: sigmoid.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sigmoid_out - -- op: sign.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sign_out - -- op: sin.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sin_out - -- op: sinh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sinh_out - -- op: slice_copy.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::slice_copy_Tensor_out - -- op: slice_scatter.out - kernels: - - arg_meta: null - kernel_name: torch::executor::slice_scatter_out - -- op: split_copy.Tensor_out - kernels: - - arg_meta: null - kernel_name: torch::executor::split_copy_Tensor_out - -- op: sqrt.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sqrt_out - -- op: squeeze_copy.dim_out - kernels: - - arg_meta: null - kernel_name: torch::executor::squeeze_copy_dim_out - -- op: stack.out - kernels: - - arg_meta: null - kernel_name: torch::executor::stack_out - -- op: sub.out - kernels: - - arg_meta: null - kernel_name: torch::executor::sub_out - -- op: sub.Scalar_out - kernels: - - arg_meta: null - kernel_name: torch::executor::sub_scalar_out - -- op: sum.IntList_out - kernels: - - arg_meta: null - kernel_name: torch::executor::sum_dim_out - -- op: t_copy.out - kernels: - - arg_meta: null - kernel_name: torch::executor::t_copy_out - -- op: tan.out - kernels: - - arg_meta: null - kernel_name: torch::executor::tan_out - -- op: tanh.out - kernels: - - arg_meta: null - kernel_name: torch::executor::tanh_out - -- op: transpose_copy.int_out - kernels: - - arg_meta: null - kernel_name: torch::executor::transpose_copy_int_out - -- op: tril.out - kernels: - - arg_meta: null - kernel_name: torch::executor::tril_out - -- op: unbind_copy.int_out - kernels: - - arg_meta: null - kernel_name: torch::executor::unbind_copy_int_out - -- op: unsqueeze_copy.out - kernels: - - arg_meta: 
null
-      kernel_name: torch::executor::unsqueeze_copy_out
-
-- op: var.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::var_out
-
-- op: view_copy.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::view_copy_out
-
-- op: where.self_out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::where_out
-
-- op: zeros.out
-  kernels:
-    - arg_meta: null
-      kernel_name: torch::executor::zeros_out
diff --git a/manual.h b/manual.h
new file mode 100644
index 00000000000..3719a142718
--- /dev/null
+++ b/manual.h
@@ -0,0 +1,6 @@
+namespace torch {
+namespace executor {
+  void manual_override();
+  void digant_add_out(torch::executor::KernelRuntimeContext & context, EValue** stack);
+  void arm_backend_register();
+}}
diff --git a/runtime/core/function_ref.h b/runtime/core/function_ref.h
index 92171134291..a07f6151f10 100644
--- a/runtime/core/function_ref.h
+++ b/runtime/core/function_ref.h
@@ -59,9 +59,7 @@ class FunctionRef;
 
 template <typename Ret, typename... Params>
 class FunctionRef<Ret(Params...)> {
-  Ret (*callback_)(const void* memory, Params... params) = nullptr;
   union Storage {
-    void* callable;
     Ret (*function)(Params...);
   } storage_;
 
@@ -70,57 +68,18 @@ class FunctionRef<Ret(Params...)> {
   explicit FunctionRef(std::nullptr_t) {}
 
   /**
-   * Case 1: A callable object passed by lvalue reference.
-   * Taking rvalue reference is error prone because the object will be always
-   * be destroyed immediately.
-   */
-  template <
-      typename Callable,
-      // This is not the copy-constructor.
-      typename std::enable_if<
-          !std::is_same<remove_cvref_t<Callable>, FunctionRef>::value,
-          int32_t>::type = 0,
-      // Avoid lvalue reference to non-capturing lambda.
-      typename std::enable_if<
-          !std::is_convertible<Callable, Ret (*)(Params...)>::value,
-          int32_t>::type = 0,
-      // Functor must be callable and return a suitable type.
-      // To make this container type safe, we need to ensure either:
-      // 1. The return type is void.
-      // 2. Or the resulting type from calling the callable is convertible to
-      //    the declared return type.
-      typename std::enable_if<
-          std::is_void<Ret>::value ||
-              std::is_convertible<
-                  decltype(std::declval<Callable>()(std::declval<Params>()...)),
-                  Ret>::value,
-          int32_t>::type = 0>
-  explicit FunctionRef(Callable& callable)
-      : callback_([](const void* memory, Params... params) {
-          auto& storage = *static_cast<const Storage*>(memory);
-          auto& callable = *static_cast<Callable*>(storage.callable);
-          return static_cast<Ret>(callable(std::forward<Params>(params)...));
-        }) {
-    storage_.callable = &callable;
-  }
-
-  /**
-   * Case 2: A plain function pointer.
+   * Case 1: A plain function pointer.
    * Instead of storing an opaque pointer to underlying callable object,
    * store a function pointer directly.
    * Note that in the future a variant which coerces compatible function
    * pointers could be implemented by erasing the storage type.
    */
-  /* implicit */ FunctionRef(Ret (*ptr)(Params...))
-      : callback_([](const void* memory, Params... params) {
-          auto& storage = *static_cast<const Storage*>(memory);
-          return storage.function(std::forward<Params>(params)...);
-        }) {
+  /* implicit */ FunctionRef(Ret (*ptr)(Params...)) {
     storage_.function = ptr;
   }
 
   /**
-   * Case 3: Implicit conversion from lambda to FunctionRef.
+   * Case 2: Implicit conversion from lambda to FunctionRef.
    * A common use pattern is like:
    *   void foo(FunctionRef<...>) {...}
    *   foo([](...){...})
@@ -144,11 +103,11 @@
       : FunctionRef(static_cast<Ret (*)(Params...)>(function)) {}
 
   Ret operator()(Params...
 params) const {
-    return callback_(&storage_, std::forward<Params>(params)...);
+    return storage_.function(std::forward<Params>(params)...);
   }
 
   explicit operator bool() const {
-    return callback_;
+    return storage_.function;
   }
 };
diff --git a/runtime/platform/target/Posix.cpp b/runtime/platform/target/Posix.cpp
index bc0f1d9f312..9f53964278a 100644
--- a/runtime/platform/target/Posix.cpp
+++ b/runtime/platform/target/Posix.cpp
@@ -52,11 +52,9 @@
 #define _ASSERT_PAL_INITIALIZED() \
   ({ \
     if (!initialized) { \
-      fprintf( \
-          ET_LOG_OUTPUT_FILE, \
+      printf( \
           "ExecuTorch PAL must be initialized before call to %s()", \
           __ET_FUNCTION); \
-      fflush(ET_LOG_OUTPUT_FILE); \
       et_pal_abort(); \
     } \
   })
@@ -144,8 +142,7 @@ void et_pal_emit_log_message(
   //
   // Clients who want to change the format or add other fields can override this
   // weak implementation of et_pal_emit_log_message.
-  fprintf(
-      ET_LOG_OUTPUT_FILE,
+  printf(
       "%c %02u:%02u:%02u.%06lu executorch:%s:%zu] %s\n",
       level,
       hour,
@@ -155,5 +152,5 @@ void et_pal_emit_log_message(
       filename,
       line,
       message);
-  fflush(ET_LOG_OUTPUT_FILE);
+  // fflush(ET_LOG_OUTPUT_FILE);
 }
diff --git a/schema/CMakeLists.txt b/schema/CMakeLists.txt
index 0c7dc2cbec4..55c07fd5f7b 100644
--- a/schema/CMakeLists.txt
+++ b/schema/CMakeLists.txt
@@ -41,7 +41,7 @@ add_custom_command(
     -o "${_program_schema__include_dir}/executorch/schema"
     ${_program_schema__srcs}
   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-  DEPENDS ${FLATC_EXECUTABLE} ${_program_schema__srcs}
+  DEPENDS ${_program_schema__srcs}
   COMMENT "Generating program_schema headers"
   VERBATIM)