diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index bbc9eec3a0e..a251891e622 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -89,6 +89,7 @@ deps = [ "executorch", "executorch_core", "extension_threadpool", + "optimized_cpublas", "portable_kernels", ] @@ -146,6 +147,7 @@ deps = [ "executorch_core", "executorch", "extension_threadpool", + "optimized_cpublas", "portable_kernels", ] # ---------------------------------- core end ---------------------------------- @@ -413,6 +415,7 @@ excludes = [ deps = [ "executorch", "executorch_core", + "optimized_cpublas", "optimized_kernels", "extension_threadpool", "reduce_util", @@ -452,6 +455,7 @@ deps = [ "extension_data_loader", "extension_module", "extension_threadpool", + "optimized_cpublas", "portable_kernels", "quantized_kernels", "xnnpack_backend", diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 931d31de8ef..13ec0b876ad 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -89,6 +89,7 @@ set(lib_list pthreadpool vulkan_backend optimized_kernels + optimized_portable_kernels cpublas eigen_blas optimized_ops_lib @@ -132,7 +133,26 @@ endforeach() # target_compile_options/target_compile_definitions for everything. 
if(TARGET cpublas) set_target_properties( - cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool + cpublas PROPERTIES INTERFACE_LINK_LIBRARIES + "extension_threadpool;eigen_blas" + ) +endif() +if(TARGET optimized_kernels) + set_target_properties( + optimized_kernels PROPERTIES INTERFACE_LINK_LIBRARIES + "executorch_core;cpublas;extension_threadpool" + ) +endif() +if(TARGET optimized_native_cpu_ops_lib) + if(TARGET optimized_portable_kernels) + set(_maybe_optimized_portable_kernels_lib optimized_portable_kernels) + else() + set(_maybe_optimized_portable_kernels_lib portable_kernels) + endif() + set_target_properties( + optimized_native_cpu_ops_lib + PROPERTIES INTERFACE_LINK_LIBRARIES + "optimized_kernels;${_maybe_optimized_portable_kernels_lib}" ) endif() if(TARGET extension_threadpool) diff --git a/configurations/CMakeLists.txt b/configurations/CMakeLists.txt index 462124a6ea6..cf304d92523 100644 --- a/configurations/CMakeLists.txt +++ b/configurations/CMakeLists.txt @@ -47,12 +47,17 @@ if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) message("Generated files ${gen_command_sources}") # optimized_native_cpu_ops_lib: Register optimized op kernels into the runtime + if(TARGET optimized_portable_kernels) + set(_optimized_native_cpu_ops_lib_portable_kernels_lib optimized_portable_kernels) + else() + set(_optimized_native_cpu_ops_lib_portable_kernels_lib portable_kernels) + endif() gen_operators_lib( LIB_NAME "optimized_native_cpu_ops_lib" KERNEL_LIBS - portable_kernels optimized_kernels + ${_optimized_native_cpu_ops_lib_portable_kernels_lib} DEPS executorch ) diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index ba722d9c791..03595efbfea 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -84,10 +84,6 @@ if(TARGET optimized_native_cpu_ops_lib) APPEND link_libraries optimized_native_cpu_ops_lib - optimized_kernels - portable_kernels - cpublas - eigen_blas ) 
target_link_options_shared_lib(optimized_native_cpu_ops_lib) else() diff --git a/kernels/portable/CMakeLists.txt index 885c509246b..e15970329c1 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -63,6 +63,22 @@ gen_operators_lib( LIB_NAME "portable_ops_lib" KERNEL_LIBS portable_kernels DEPS executorch ) +# Portable kernels support optional parallelization (and, in the +# future, perhaps other performance features). If support is present, +# produce an optimized version. +set(BUILD_OPTIMIZED_PORTABLE_KERNELS ${EXECUTORCH_BUILD_PTHREADPOOL}) + +if(BUILD_OPTIMIZED_PORTABLE_KERNELS) + add_library(optimized_portable_kernels ${_portable_kernels__srcs}) + target_link_libraries(optimized_portable_kernels PRIVATE executorch) + target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool) + target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options}) + install( + TARGETS optimized_portable_kernels + DESTINATION lib + ) +endif() + install( TARGETS portable_kernels portable_ops_lib DESTINATION lib diff --git a/kernels/portable/cpu/op_argmin.cpp index a0ee82d2612..87e90de4c04 100644 --- a/kernels/portable/cpu/op_argmin.cpp +++ b/kernels/portable/cpu/op_argmin.cpp @@ -12,6 +12,7 @@ #include #include +#include #include namespace torch { @@ -47,30 +48,43 @@ Tensor& argmin_out( ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "argmin.out", CTYPE, [&] { long* out_data = out.mutable_data_ptr(); - for (const auto out_ix : c10::irange(out.numel())) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { - // the below condition as written is equivalent to !isnan(accval) && - // (isnan(v) || v < acc_val). cases: - // - if neither acc_val nor v is NaN, !(v >= acc_val) is - // trivially equivalent to v < acc_val. - // - if acc_val is NaN, the whole thing is trivially false. 
- // - if acc_val is not NaN and v is NaN, then v >= acc_val - // - is false because all comparisons involving NaN are - // - false, so the result is true. The result is trivially - // - true for the above condition that uses isnan(v) as - // - well. - if (!std::isnan(acc_val) && !(v >= acc_val)) { - acc_val = v; - acc_ix = ix; - } - return std::tuple{acc_val, acc_ix}; - }, - in, - dim, - out_ix); - out_data[out_ix] = std::get<1>(acc); - } + // REVIEW: this is the parallelization strategy ATen uses + // specifically when the reduction is along the last dimension and + // that dimension is contiguous. Is there any particular reason we + // shouldn't just always use this strategy since we aren't + // otherwise capable of parallelizing reductions? + const int64_t reduction_size = get_reduced_dim_product(in, dim); + const auto grain_size = std::max( + static_cast(1), + executorch::extension::internal::GRAIN_SIZE / reduction_size); + const bool success = executorch::extension::parallel_for( + 0, out.numel(), grain_size, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + std::tuple acc = reduce_over_dim( + [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + // the below condition as written is equivalent to + // !isnan(acc_val) && (isnan(v) || v < acc_val). cases: + // - if neither acc_val nor v is NaN, !(v >= acc_val) is + // trivially equivalent to v < acc_val. + // - if acc_val is NaN, the whole thing is trivially false. + // - if acc_val is not NaN and v is NaN, then v >= acc_val + // - is false because all comparisons involving NaN are + // - false, so the result is true. The result is trivially + // - true for the above condition that uses isnan(v) as + // - well. 
+ if (!std::isnan(acc_val) && !(v >= acc_val)) { + acc_val = v; + acc_ix = ix; + } + return std::tuple{acc_val, acc_ix}; + }, + in, + dim, + out_ix); + out_data[out_ix] = std::get<1>(acc); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return out; diff --git a/kernels/portable/cpu/util/reduce_util.cpp b/kernels/portable/cpu/util/reduce_util.cpp index 09ba508a31d..31296d67ee7 100644 --- a/kernels/portable/cpu/util/reduce_util.cpp +++ b/kernels/portable/cpu/util/reduce_util.cpp @@ -83,12 +83,8 @@ size_t get_reduced_dim_product( if (in.dim() == 0) { return 1; } - size_t dim_product = 1; if (!dim.has_value()) { - for (size_t i = 0; i < static_cast(in.dim()); ++i) { - dim_product *= in.size(i); - } - return dim_product; + return in.numel(); } const size_t d = _normalize_non_neg_d(dim.value(), in.dim()); return in.size(d); @@ -104,16 +100,12 @@ size_t get_reduced_dim_product( if (in.dim() == 0) { return 1; } - size_t dim_product = 1; - const size_t in_dim = in.dim(); if (!dim_list.has_value() || dim_list.value().size() == 0) { - for (size_t i = 0; i < static_cast(in.dim()); ++i) { - dim_product *= in.size(i); - } - return dim_product; + return in.numel(); } + size_t dim_product = 1; for (const auto& d : dim_list.value()) { - const size_t non_neg_d = _normalize_non_neg_d(d, in_dim); + const size_t non_neg_d = _normalize_non_neg_d(d, in.dim()); dim_product *= in.size(non_neg_d); } return dim_product; diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt index 394ec241698..af0716b06ed 100644 --- a/kernels/test/CMakeLists.txt +++ b/kernels/test/CMakeLists.txt @@ -23,11 +23,11 @@ foreach(kernel ${_kernels}) "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/test" ) set(_wrapper_path "${_wrapper_dir}/FunctionHeaderWrapper.h") + set(_functions_include "#include ") add_custom_command( OUTPUT "${_wrapper_path}" COMMAND mkdir -p ${_wrapper_dir} - COMMAND echo "#include " > - "${_wrapper_path}" + COMMAND echo 
${_functions_include} > "${_wrapper_path}" DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/Functions.h" "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/NativeFunctions.h" @@ -53,7 +53,17 @@ foreach(kernel ${_kernels}) COMMENT "Generating ${_wrapper_dir}/supported_features.cpp and header" VERBATIM ) - + if(${kernel} STREQUAL "optimized") + set(_kernel_ops_lib "optimized_native_cpu_ops_lib") + set(_kernel_ops_lib_path + "${CMAKE_CURRENT_BINARY_DIR}/../../configurations/optimized_native_cpu_ops_lib" + ) + else() + set(_kernel_ops_lib "${kernel}_ops_lib") + set(_kernel_ops_lib_path + "${CMAKE_CURRENT_BINARY_DIR}/../../kernels/${kernel}/${kernel}_ops_lib" + ) + endif() add_custom_command( OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/Functions.h" @@ -63,10 +73,9 @@ foreach(kernel ${_kernels}) mkdir -p "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/" COMMAND - cp - "${CMAKE_CURRENT_BINARY_DIR}/../../kernels/${kernel}/${kernel}_ops_lib/*.h" + cp "${_kernel_ops_lib_path}/*.h" "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/" - DEPENDS "${kernel}_ops_lib" + DEPENDS ${_kernel_ops_lib} ) endforeach() @@ -280,6 +289,11 @@ set(_optimized_kernels_test_sources ${CMAKE_CURRENT_BINARY_DIR}/include/optimized/executorch/kernels/test/supported_features.cpp ) +if(TARGET optimized_portable_kernels) + list(APPEND _optimized_kernels_test_sources ${all_test_sources}) + list(REMOVE_DUPLICATES _optimized_kernels_test_sources) +endif() + et_cxx_test( optimized_kernels_test SOURCES @@ -287,16 +301,16 @@ et_cxx_test( EXTRA_LIBS cpuinfo extension_threadpool - optimized_kernels - optimized_ops_lib - portable_kernels + optimized_native_cpu_ops_lib pthreadpool eigen_blas ) add_dependencies(optimized_kernels_test generate_wrapper) target_include_directories( - optimized_kernels_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/include/optimized" - 
"${CMAKE_INSTALL_PREFIX}/include" + optimized_kernels_test + PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/include/optimized" + "${CMAKE_CURRENT_BINARY_DIR}/include/portable" + "${CMAKE_INSTALL_PREFIX}/include" ) if(TARGET quantized_kernels) diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl index b56413b92f4..dd48da64c30 100644 --- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -284,6 +284,7 @@ ATEN_OPS = ( name = "op_argmin", deps = [ "//executorch/kernels/portable/cpu/util:reduce_util", + "//executorch/runtime/kernel:thread_parallel_interface", ], ), op_target(