From d0b11e8bb47dacb6e200ac8b0e2609626ccee0f3 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 11:35:29 -0800 Subject: [PATCH 01/38] Update [ghstack-poisoned] --- CMakeLists.txt | 2 +- build/cmake_deps.toml | 18 ++++++++++++++++++ kernels/optimized/CMakeLists.txt | 2 +- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index de941663a88..73b89b6171e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -749,9 +749,9 @@ endif() if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO - AND CMAKE_CXX_STANDARD GREATER_EQUAL 14 ) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/parallel) endif() if(EXECUTORCH_BUILD_PYBIND) diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index 4b22a09cb5b..4bbfd636a96 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -73,6 +73,7 @@ excludes = [ deps = [ "executorch", "executorch_core", + "extension_parallel", "extension_threadpool", "portable_kernels", ] @@ -115,6 +116,7 @@ excludes = [ deps = [ "executorch_core", "executorch", + "extension_parallel", ] [targets.optimized_native_cpu_ops] @@ -129,6 +131,8 @@ excludes = [ deps = [ "executorch_core", "executorch", + "extension_parallel", + "extension_threadpool", "portable_kernels", ] # ---------------------------------- core end ---------------------------------- @@ -208,6 +212,19 @@ deps = [ "extension_runner_util", ] +[targets.extension_parallel] +buck_targets = [ + "//extension/parallel:thread_parallel", +] +filters = [ + ".cpp$", +] +deps = [ + "executorch", + "executorch_core", + "extension_threadpool", +] + [targets.extension_tensor] buck_targets = [ "//extension/tensor:tensor", @@ -395,6 +412,7 @@ deps = [ "executorch", "executorch_core", "optimized_kernels", + "extension_parallel", "extension_threadpool", "xnnpack_backend", ] diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index 
235c6738d9a..d9b19d4f9c2 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -43,7 +43,7 @@ endif() list(TRANSFORM _optimized_cpublas__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(cpublas STATIC ${_optimized_cpublas__srcs}) target_link_libraries( - cpublas PRIVATE executorch_core eigen_blas extension_threadpool + cpublas PRIVATE executorch_core eigen_blas extension_parallel extension_threadpool ) target_compile_options(cpublas PUBLIC ${_common_compile_options}) From 9437be1e7055d5705540d3544955b5d30f72be43 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 12:29:58 -0800 Subject: [PATCH 02/38] Update [ghstack-poisoned] --- extension/parallel/CMakeLists.txt | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 extension/parallel/CMakeLists.txt diff --git a/extension/parallel/CMakeLists.txt b/extension/parallel/CMakeLists.txt new file mode 100644 index 00000000000..7f727aafe46 --- /dev/null +++ b/extension/parallel/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# Please keep this file formatted by running: +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ + +if(NOT (EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO)) + message(FATAL_ERROR "extension/parallel requires extension/threadpool") +endif() + +add_library(extension_parallel thread_parallel.cpp) + +target_link_libraries(extension_parallel PUBLIC executorch_core extension_threadpool) +target_compile_options(extension_parallel PUBLIC ${_common_compile_options}) + +install( + TARGETS extension_parallel + DESTINATION lib + INCLUDES + DESTINATION ${_common_include_directories}) From 643e10ee081b1ea34f0f5fb49f7df44a9f2f666b Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 14:54:47 -0800 Subject: [PATCH 03/38] Update [ghstack-poisoned] --- build/executorch-config.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index d14a1227cd9..539c7c2960e 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -67,6 +67,7 @@ set(lib_list portable_ops_lib extension_module extension_module_static + extension_parallel extension_runner_util extension_tensor extension_threadpool @@ -114,3 +115,7 @@ foreach(lib ${lib_list}) list(APPEND EXECUTORCH_LIBRARIES ${lib}) endif() endforeach() + +# TODO: investigate use of install(EXPORT) to cleanly handle +# target_compile_options/target_compile_definitions for everything. 
+target_link_libraries(cpublas INTERFACE extension_parallel) From 6f2842b876a5a2310c6bf7311d6f6b76bc54549e Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 14:54:48 -0800 Subject: [PATCH 04/38] Update [ghstack-poisoned] --- build/executorch-config.cmake | 1 + examples/models/llama/CMakeLists.txt | 9 --------- examples/models/llava/CMakeLists.txt | 9 --------- 3 files changed, 1 insertion(+), 18 deletions(-) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 539c7c2960e..d238db8ca95 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -65,6 +65,7 @@ set(lib_list neuron_backend qnn_executorch_backend portable_ops_lib + custom_ops extension_module extension_module_static extension_parallel diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index 5f49581ea25..f5d5a78d430 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -84,14 +84,6 @@ if(CMAKE_TOOLCHAIN_IOS OR ANDROID) target_link_options_shared_lib(executorch) endif() -# custom ops library -if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/custom_ops - ${CMAKE_CURRENT_BINARY_DIR}/../../../extension/llm/custom_ops - ) -endif() - # llama_runner library add_subdirectory(runner) @@ -119,7 +111,6 @@ target_link_options_shared_lib(quantized_ops_lib) list(APPEND link_libraries quantized_kernels quantized_ops_lib) if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - target_link_options_shared_lib(custom_ops) list(APPEND link_libraries custom_ops) endif() diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index ecd00809fdb..f7fa4bacc04 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -93,14 +93,6 @@ if(CMAKE_TOOLCHAIN_IOS OR ANDROID) target_link_options_shared_lib(executorch) endif() -# custom ops library -if(EXECUTORCH_BUILD_KERNELS_CUSTOM) 
- add_subdirectory( - ${EXECUTORCH_ROOT}/extension/llm/custom_ops - ${CMAKE_CURRENT_BINARY_DIR}/../../../extension/llm/custom_ops - ) -endif() - # llava_runner library add_subdirectory(runner) @@ -132,7 +124,6 @@ target_link_options_shared_lib(quantized_ops_lib) list(APPEND link_libraries quantized_kernels quantized_ops_lib) if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - target_link_options_shared_lib(custom_ops) list(APPEND link_libraries custom_ops) endif() From e47dfeb68a2e8e2ae0e3fa2add553f724f73404d Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 14:54:52 -0800 Subject: [PATCH 05/38] Update [ghstack-poisoned] --- build/executorch-config.cmake | 1 + examples/models/llama/CMakeLists.txt | 1 - examples/models/llava/CMakeLists.txt | 1 - examples/models/llava/targets.bzl | 3 --- extension/android/CMakeLists.txt | 1 - extension/llm/custom_ops/CMakeLists.txt | 4 ++-- extension/threadpool/CMakeLists.txt | 1 + 7 files changed, 4 insertions(+), 8 deletions(-) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index d238db8ca95..75e1075d929 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -120,3 +120,4 @@ endforeach() # TODO: investigate use of install(EXPORT) to cleanly handle # target_compile_options/target_compile_definitions for everything. 
target_link_libraries(cpublas INTERFACE extension_parallel) +target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL) diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index f5d5a78d430..b3364be610a 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -131,7 +131,6 @@ endif() set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack) # Extra compile option and include dir for pthreadpool if(EXECUTORCH_BUILD_PTHREADPOOL) - list(APPEND _common_compile_options -DET_USE_THREADPOOL) list(APPEND link_libraries extension_threadpool pthreadpool) list(APPEND _common_include_directories ${XNNPACK_ROOT}/third-party/pthreadpool/include diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index f7fa4bacc04..5d5857dd5af 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -130,7 +130,6 @@ endif() set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack) # Extra compile option and include dir for pthreadpool if(EXECUTORCH_BUILD_PTHREADPOOL) - list(APPEND _common_compile_options -DET_USE_THREADPOOL) list(APPEND link_libraries extension_threadpool pthreadpool) list(APPEND _common_include_directories ${XNNPACK_ROOT}/third-party/pthreadpool/include diff --git a/examples/models/llava/targets.bzl b/examples/models/llava/targets.bzl index 5efb099f06f..6f3a370acf4 100644 --- a/examples/models/llava/targets.bzl +++ b/examples/models/llava/targets.bzl @@ -7,9 +7,6 @@ def define_common_targets(): "main.cpp", ], compiler_flags = ["-Wno-global-constructors"], - preprocessor_flags = [ - "-DET_USE_THREADPOOL", - ], deps = [ "//executorch/examples/models/llava/runner:runner", "//executorch/extension/evalue_util:print_evalue", diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 70f21f2751c..849d1d14364 100644 --- a/extension/android/CMakeLists.txt +++ 
b/extension/android/CMakeLists.txt @@ -124,7 +124,6 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM) endif() if(TARGET pthreadpool) - target_compile_definitions(executorch_jni PRIVATE ET_USE_THREADPOOL=1) target_include_directories( executorch_jni PUBLIC diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index c3969e6f9bf..eeb118d4344 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -78,7 +78,7 @@ target_include_directories( target_link_libraries(custom_ops PUBLIC ${custom_ops_libs} executorch_core) target_compile_options( - custom_ops PUBLIC ${_common_compile_options} -DET_USE_THREADPOOL + custom_ops PUBLIC ${_common_compile_options} ) install(TARGETS custom_ops DESTINATION lib) @@ -130,7 +130,7 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) target_compile_options( custom_ops_aot_lib PUBLIC -Wno-deprecated-declarations -fPIC -frtti -fexceptions - ${_common_compile_options} -DET_USE_THREADPOOL + ${_common_compile_options} ) install(TARGETS custom_ops_aot_lib diff --git a/extension/threadpool/CMakeLists.txt b/extension/threadpool/CMakeLists.txt index 90288656674..c1d86acf75d 100644 --- a/extension/threadpool/CMakeLists.txt +++ b/extension/threadpool/CMakeLists.txt @@ -32,6 +32,7 @@ target_include_directories( PUBLIC ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include ) +target_compile_definitions(extension_threadpool PUBLIC ET_USE_THREADPOOL) target_compile_options(extension_threadpool PUBLIC ${_common_compile_options}) # Install libraries From a92958a1f6fbfbc154ec24c6b4ee6c6ebd41aea8 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 18:41:25 -0800 Subject: [PATCH 06/38] Update [ghstack-poisoned] --- build/executorch-config.cmake | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 
1a2da26416e..8f64c502b47 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -118,6 +118,8 @@ endforeach() # TODO: investigate use of install(EXPORT) to cleanly handle # target_compile_options/target_compile_definitions for everything. -set_target_properties( - cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_parallel -) +if (TARGET cpublas) + set_target_properties( + cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_parallel + ) +endif() From 3bd64370f6454f9523b6cc51d05d377fed8a77fb Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 19:54:10 -0800 Subject: [PATCH 07/38] Update [ghstack-poisoned] --- build/executorch-config.cmake | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 8f64c502b47..2c459b66ac8 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -118,7 +118,12 @@ endforeach() # TODO: investigate use of install(EXPORT) to cleanly handle # target_compile_options/target_compile_definitions for everything. 
-if (TARGET cpublas) +if(TARGET extension_parallel) + set_target_properties( + extension_parallel PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool + ) +endif() +if(TARGET cpublas) set_target_properties( cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_parallel ) From 9fdebee5a3e9a439895df57b49343816fcceee86 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 21:11:25 -0800 Subject: [PATCH 08/38] Update [ghstack-poisoned] --- build/cmake_deps.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index 4bbfd636a96..4811563269c 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -449,6 +449,7 @@ deps = [ "executorch_core", "extension_data_loader", "extension_module", + "extension_parallel", "portable_kernels", "quantized_kernels", "xnnpack_backend", From e48e81617b32f3460c5b449b8a524e221f79bef1 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 5 Mar 2025 14:40:02 -0800 Subject: [PATCH 09/38] Update [ghstack-poisoned] --- extension/parallel/targets.bzl | 38 ++++++++++++-------------- extension/parallel/thread_parallel.cpp | 12 +++++--- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/extension/parallel/targets.bzl b/extension/parallel/targets.bzl index 82b3deab129..82a8502c034 100644 --- a/extension/parallel/targets.bzl +++ b/extension/parallel/targets.bzl @@ -7,24 +7,20 @@ def define_common_targets(): TARGETS and BUCK files that call this function. 
""" - for aten_mode in get_aten_mode_options(): - aten_suffix = ("_aten" if aten_mode else "") - - runtime.cxx_library( - name = "thread_parallel" + aten_suffix, - srcs = [ - "thread_parallel.cpp", - ], - exported_headers = [ - "thread_parallel.h", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//executorch/extension/threadpool:threadpool", - "//executorch/runtime/core:core", - "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, - ], - ) + runtime.cxx_library( + name = "thread_parallel", + srcs = [ + "thread_parallel.cpp", + ], + exported_headers = [ + "thread_parallel.h", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//executorch/extension/threadpool:threadpool", + "//executorch/runtime/core:core", + ], + ) diff --git a/extension/parallel/thread_parallel.cpp b/extension/parallel/thread_parallel.cpp index dfbb911d3a9..5d481ccd44c 100644 --- a/extension/parallel/thread_parallel.cpp +++ b/extension/parallel/thread_parallel.cpp @@ -6,11 +6,12 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include #include #include -#include +#include #include namespace executorch { @@ -53,9 +54,12 @@ bool parallel_for( const int64_t end, const int64_t grain_size, const std::function& f) { - ET_LOG_AND_RETURN_IF_FALSE(begin >= 0 && end >= 0); - ET_LOG_AND_RETURN_IF_FALSE(end >= begin); - ET_LOG_AND_RETURN_IF_FALSE(grain_size > 0); + ET_CHECK_OR_RETURN_FALSE( + begin >= 0 && end >= 0 && end >= begin, + "begin = %" PRId64 ", end = %" PRId64, + begin, + end); + ET_CHECK_OR_RETURN_FALSE(grain_size > 0, "grain_size = %" PRId64, grain_size); int64_t num_tasks = 0, chunk_size = 0; std::tie(num_tasks, chunk_size) = calc_num_tasks_and_chunk_size(begin, end, grain_size); From 3351d50555714d72d46d0ff4f096eff4ab4e61c4 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 5 Mar 2025 16:38:46 -0800 Subject: [PATCH 10/38] Update [ghstack-poisoned] --- .lintrunner.toml | 2 + CMakeLists.txt | 1 - Test.cmake | 1 - build/cmake_deps.toml | 19 +--- build/executorch-config.cmake | 8 +- extension/llm/custom_ops/op_sdpa.cpp | 2 +- extension/llm/custom_ops/targets.bzl | 1 - extension/parallel/CMakeLists.txt | 25 ----- extension/parallel/TARGETS | 8 -- extension/parallel/targets.bzl | 26 ------ extension/parallel/test/TARGETS | 8 -- extension/parallel/test/targets.bzl | 19 ---- extension/parallel/thread_parallel.h | 49 ++-------- extension/threadpool/CMakeLists.txt | 7 +- extension/threadpool/targets.bzl | 3 + .../test/CMakeLists.txt | 20 +--- extension/threadpool/test/targets.bzl | 12 +++ .../test/thread_parallel_test.cpp | 41 ++++++--- .../thread_parallel.cpp | 2 +- kernels/optimized/CMakeLists.txt | 2 +- kernels/optimized/blas/BlasKernel.h | 2 +- kernels/optimized/lib_defs.bzl | 6 +- runtime/kernel/targets.bzl | 13 +++ runtime/kernel/thread_parallel_interface.h | 92 +++++++++++++++++++ test/utils/OSSTestConfig.json | 10 ++ 25 files changed, 185 insertions(+), 194 deletions(-) delete mode 100644 extension/parallel/CMakeLists.txt delete mode 100644 extension/parallel/TARGETS 
delete mode 100644 extension/parallel/targets.bzl delete mode 100644 extension/parallel/test/TARGETS delete mode 100644 extension/parallel/test/targets.bzl rename extension/{parallel => threadpool}/test/CMakeLists.txt (53%) rename extension/{parallel => threadpool}/test/thread_parallel_test.cpp (77%) rename extension/{parallel => threadpool}/thread_parallel.cpp (97%) create mode 100644 runtime/kernel/thread_parallel_interface.h diff --git a/.lintrunner.toml b/.lintrunner.toml index 7667ac430d1..1a27228d266 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -218,6 +218,8 @@ exclude_patterns = [ 'examples/**', 'extension/**', 'kernels/optimized/**', + # Justified include. + 'runtime/kernel/thread_parallel_interface.h', 'scripts/**', 'third-party/**', 'util/**', diff --git a/CMakeLists.txt b/CMakeLists.txt index 73b89b6171e..fabf667cbe1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -751,7 +751,6 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO ) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/parallel) endif() if(EXECUTORCH_BUILD_PYBIND) diff --git a/Test.cmake b/Test.cmake index d4b5f6aa1db..6bd7a86e70b 100644 --- a/Test.cmake +++ b/Test.cmake @@ -13,7 +13,6 @@ if(BUILD_TESTING) add_subdirectory(extension/evalue_util/test) add_subdirectory(extension/kernel_util/test) add_subdirectory(extension/memory_allocator/test) - add_subdirectory(extension/parallel/test) add_subdirectory(extension/pytree/test) add_subdirectory(kernels/portable/cpu/util/test) add_subdirectory(kernels/prim_ops/test) diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index b1ed81b6a7e..9937f1b882f 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -88,7 +88,6 @@ excludes = [ deps = [ "executorch", "executorch_core", - "extension_parallel", "extension_threadpool", "portable_kernels", ] @@ -131,7 +130,6 @@ excludes = [ deps = [ "executorch_core", "executorch", - 
"extension_parallel", ] [targets.optimized_native_cpu_ops] @@ -146,7 +144,6 @@ excludes = [ deps = [ "executorch_core", "executorch", - "extension_parallel", "extension_threadpool", "portable_kernels", ] @@ -227,19 +224,6 @@ deps = [ "extension_runner_util", ] -[targets.extension_parallel] -buck_targets = [ - "//extension/parallel:thread_parallel", -] -filters = [ - ".cpp$", -] -deps = [ - "executorch", - "executorch_core", - "extension_threadpool", -] - [targets.extension_tensor] buck_targets = [ "//extension/tensor:tensor", @@ -393,6 +377,7 @@ filters = [ deps = [ "executorch", "executorch_core", + "extension_threadpool", ] [targets.xnnpack_schema] @@ -427,7 +412,6 @@ deps = [ "executorch", "executorch_core", "optimized_kernels", - "extension_parallel", "extension_threadpool", "reduce_util", "xnnpack_backend", @@ -465,7 +449,6 @@ deps = [ "executorch_core", "extension_data_loader", "extension_module", - "extension_parallel", "portable_kernels", "quantized_kernels", "xnnpack_backend", diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 35fe03467f2..2e8cb95b70f 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -68,7 +68,6 @@ set(lib_list custom_ops extension_module extension_module_static - extension_parallel extension_runner_util extension_tensor extension_threadpool @@ -119,14 +118,9 @@ endforeach() # TODO: investigate use of install(EXPORT) to cleanly handle # target_compile_options/target_compile_definitions for everything. 
-if(TARGET extension_parallel) - set_target_properties( - extension_parallel PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool - ) -endif() if(TARGET cpublas) set_target_properties( - cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_parallel + cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool ) endif() target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL) diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index f0a7775e803..db7cb42f6d0 100644 --- a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -19,7 +19,7 @@ #include #ifdef ET_USE_THREADPOOL -#include +#include #include #endif #include diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl index e3e8b30520f..1c4686fe3d0 100644 --- a/extension/llm/custom_ops/targets.bzl +++ b/extension/llm/custom_ops/targets.bzl @@ -37,7 +37,6 @@ def define_common_targets(): "//executorch/kernels/optimized:libblas{}".format(mkl_dep), "//executorch/kernels/optimized:libvec", "//executorch/extension/kernel_util:kernel_util", - "//executorch/extension/parallel:thread_parallel", "//executorch/extension/threadpool:threadpool", ], deps = [ diff --git a/extension/parallel/CMakeLists.txt b/extension/parallel/CMakeLists.txt deleted file mode 100644 index 7f727aafe46..00000000000 --- a/extension/parallel/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# Please keep this file formatted by running: -# ~~~ -# cmake-format -i CMakeLists.txt -# ~~~ - -if(NOT (EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO)) - message(FATAL_ERROR "extension/parallel requires extension/threadpool") -endif() - -add_library(extension_parallel thread_parallel.cpp) - -target_link_libraries(extension_parallel PUBLIC executorch_core extension_threadpool) -target_compile_options(extension_parallel PUBLIC ${_common_compile_options}) - -install( - TARGETS extension_parallel - DESTINATION lib - INCLUDES - DESTINATION ${_common_include_directories}) diff --git a/extension/parallel/TARGETS b/extension/parallel/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/parallel/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/parallel/targets.bzl b/extension/parallel/targets.bzl deleted file mode 100644 index 82a8502c034..00000000000 --- a/extension/parallel/targets.bzl +++ /dev/null @@ -1,26 +0,0 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") - -def define_common_targets(): - """Defines targets that should be shared between fbcode and xplat. - - The directory containing this targets.bzl file should also contain both - TARGETS and BUCK files that call this function. 
- """ - - runtime.cxx_library( - name = "thread_parallel", - srcs = [ - "thread_parallel.cpp", - ], - exported_headers = [ - "thread_parallel.h", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//executorch/extension/threadpool:threadpool", - "//executorch/runtime/core:core", - ], - ) diff --git a/extension/parallel/test/TARGETS b/extension/parallel/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/parallel/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/parallel/test/targets.bzl b/extension/parallel/test/targets.bzl deleted file mode 100644 index 791c0727471..00000000000 --- a/extension/parallel/test/targets.bzl +++ /dev/null @@ -1,19 +0,0 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -def define_common_targets(): - """Defines targets that should be shared between fbcode and xplat. - - The directory containing this targets.bzl file should also contain both - TARGETS and BUCK files that call this function. - """ - - runtime.cxx_test( - name = "thread_parallel_test", - srcs = [ - "thread_parallel_test.cpp", - ], - deps = [ - "//executorch/extension/parallel:thread_parallel", - "//executorch/runtime/platform:platform", - ], - ) diff --git a/extension/parallel/thread_parallel.h b/extension/parallel/thread_parallel.h index 8b174075ae9..5f4edeb333c 100644 --- a/extension/parallel/thread_parallel.h +++ b/extension/parallel/thread_parallel.h @@ -8,46 +8,9 @@ #pragma once -#include -#include - -namespace executorch { -namespace extension { - -/** - * A helper to run function in parallel. 
- * - * begin, end: describe the extent of the workitems via first and last workitem - * to be processed - * grain_size: number of workitems processed by user callback which is - * described below - * f: user function applied in parallel to the chunks, signature: - * void f(int64_t begin, int64_t end) - * Returns true if all work items are processed successfully, false otherwise - * - * Warning: parallel_for does NOT copy thread local states from the current - * thread to the worker threads. Users need to protect the access to captured - * data if they mutate them in f. - */ -bool parallel_for( - const int64_t begin, - const int64_t end, - const int64_t grain_size, - const std::function& f); - -int64_t get_thread_num(); - -void set_thread_num(int64_t thread_num); - -} // namespace extension -} // namespace executorch - -namespace torch { -namespace executor { -// TODO(T197294990): Remove these deprecated aliases once all users have moved -// to the new `::executorch` namespaces. -using ::executorch::extension::get_thread_num; -using ::executorch::extension::parallel_for; -using ::executorch::extension::set_thread_num; -} // namespace executor -} // namespace torch +// This header is a stub left behind after the move to +// executorch/runtime/kernel. Depend on this target and include this +// header if you have a hard requirement for threading; if you want to +// cleanly use parallelization if available, then depend on and use +// the below header instead. 
+#include diff --git a/extension/threadpool/CMakeLists.txt b/extension/threadpool/CMakeLists.txt index c1d86acf75d..6e107cb6634 100644 --- a/extension/threadpool/CMakeLists.txt +++ b/extension/threadpool/CMakeLists.txt @@ -21,7 +21,8 @@ if(NOT CMAKE_CXX_STANDARD) endif() add_library( - extension_threadpool threadpool.cpp threadpool_guard.cpp cpuinfo_utils.cpp + extension_threadpool threadpool.cpp threadpool_guard.cpp thread_parallel.cpp + cpuinfo_utils.cpp ) target_link_libraries( extension_threadpool PUBLIC executorch_core cpuinfo pthreadpool @@ -42,3 +43,7 @@ install( INCLUDES DESTINATION ${_common_include_directories} ) + +if(BUILD_TESTING) + add_subdirectory(test) +endif() diff --git a/extension/threadpool/targets.bzl b/extension/threadpool/targets.bzl index 8bb0398b385..1c34dbbc7d4 100644 --- a/extension/threadpool/targets.bzl +++ b/extension/threadpool/targets.bzl @@ -9,6 +9,7 @@ def define_common_targets(): """ _THREADPOOL_SRCS = [ + "thread_parallel.cpp", "threadpool.cpp", "threadpool_guard.cpp", ] + (["fb/threadpool_use_n_threads.cpp"] if not runtime.is_oss else []) @@ -29,6 +30,8 @@ def define_common_targets(): exported_deps = [ third_party_dep("pthreadpool"), third_party_dep("cpuinfo"), + # Allow users to use the header without an extra deps entry. + "//executorch/runtime/kernel:thread_parallel_interface", ], exported_preprocessor_flags = [ "-DET_USE_THREADPOOL", diff --git a/extension/parallel/test/CMakeLists.txt b/extension/threadpool/test/CMakeLists.txt similarity index 53% rename from extension/parallel/test/CMakeLists.txt rename to extension/threadpool/test/CMakeLists.txt index ab37f66c17d..3f9b13f2ab4 100644 --- a/extension/parallel/test/CMakeLists.txt +++ b/extension/threadpool/test/CMakeLists.txt @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# @generated by test/utils/generate_gtest_cmakelists.py +# # This file should be formatted with # ~~~ # cmake-format -i CMakeLists.txt @@ -12,28 +14,14 @@ # cmake_minimum_required(VERSION 3.19) -project(extension_parallel_test) - -# Use C++17 for test. -set(CMAKE_CXX_STANDARD 17) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) include(${EXECUTORCH_ROOT}/build/Test.cmake) -set(_test_srcs thread_parallel_test.cpp ../thread_parallel.cpp) +set(_test_srcs thread_parallel_test.cpp threadpool_test.cpp) et_cxx_test( - extension_parallel_test - SOURCES - ${_test_srcs} - EXTRA_LIBS - pthreadpool - cpuinfo + extension_threadpool_test SOURCES ${_test_srcs} EXTRA_LIBS extension_threadpool ) -target_include_directories( - extension_parallel_test - PRIVATE ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include - ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include -) diff --git a/extension/threadpool/test/targets.bzl b/extension/threadpool/test/targets.bzl index b8a39d8969a..8bdf776c825 100644 --- a/extension/threadpool/test/targets.bzl +++ b/extension/threadpool/test/targets.bzl @@ -18,3 +18,15 @@ def define_common_targets(): "//executorch/extension/threadpool:threadpool", ], ) + + runtime.cxx_test( + name = "thread_parallel_test", + srcs = [ + "thread_parallel_test.cpp", + ], + deps = [ + "//executorch/extension/threadpool:threadpool", + "//executorch/runtime/kernel:thread_parallel_interface", + "//executorch/runtime/platform:platform", + ], + ) diff --git a/extension/parallel/test/thread_parallel_test.cpp b/extension/threadpool/test/thread_parallel_test.cpp similarity index 77% rename from extension/parallel/test/thread_parallel_test.cpp rename to extension/threadpool/test/thread_parallel_test.cpp index d386429100d..63581be29e8 100644 --- a/extension/parallel/test/thread_parallel_test.cpp +++ b/extension/threadpool/test/thread_parallel_test.cpp @@ -11,13 +11,16 @@ #include #include -#include +#include #include using namespace ::testing; using 
::executorch::extension::parallel_for; -class ParallelTest : public ::testing::Test { +#ifndef ET_USE_THREADPOOL +#endif + +class ParallelTest : public ::testing::TestWithParam { protected: void SetUp() override { data_.fill(0); @@ -42,12 +45,20 @@ class ParallelTest : public ::testing::Test { } } + template + bool parallel_for(const int64_t begin, const int64_t end, const int64_t grain_size, const Func& func) { + if (GetParam()) { + return executorch::extension::parallel_for(begin, end, grain_size, func); + } + return executorch::extension::internal::parallel_for_no_threadpool(begin, end, grain_size, func); + } + std::array data_; std::mutex mutex_; int sum_of_all_elements_; }; -TEST_F(ParallelTest, TestAllInvoked) { +TEST_P(ParallelTest, TestAllInvoked) { EXPECT_TRUE(parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -57,7 +68,7 @@ TEST_F(ParallelTest, TestAllInvoked) { } } -TEST_F(ParallelTest, TestAllInvokedWithMutex) { +TEST_P(ParallelTest, TestAllInvokedWithMutex) { EXPECT_TRUE(parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); })); @@ -70,7 +81,7 @@ TEST_F(ParallelTest, TestAllInvokedWithMutex) { EXPECT_EQ(sum_of_all_elements_, expected_sum); } -TEST_F(ParallelTest, TestInvalidRange) { +TEST_P(ParallelTest, TestInvalidRange) { et_pal_init(); EXPECT_FALSE(parallel_for(10, 0, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); @@ -82,7 +93,7 @@ TEST_F(ParallelTest, TestInvalidRange) { EXPECT_EQ(sum_of_all_elements_, 0); } -TEST_F(ParallelTest, TestInvalidRange2) { +TEST_P(ParallelTest, TestInvalidRange2) { et_pal_init(); EXPECT_FALSE(parallel_for(6, 5, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); @@ -94,7 +105,7 @@ TEST_F(ParallelTest, TestInvalidRange2) { EXPECT_EQ(sum_of_all_elements_, 0); } -TEST_F(ParallelTest, TestInvokePartialFromBeginning) { +TEST_P(ParallelTest, TestInvokePartialFromBeginning) { 
EXPECT_TRUE(parallel_for(0, 5, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -107,7 +118,7 @@ TEST_F(ParallelTest, TestInvokePartialFromBeginning) { } } -TEST_F(ParallelTest, TestInvokePartialToEnd) { +TEST_P(ParallelTest, TestInvokePartialToEnd) { EXPECT_TRUE(parallel_for(5, 10, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -120,7 +131,7 @@ TEST_F(ParallelTest, TestInvokePartialToEnd) { } } -TEST_F(ParallelTest, TestInvokePartialMiddle) { +TEST_P(ParallelTest, TestInvokePartialMiddle) { EXPECT_TRUE(parallel_for(2, 8, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -136,7 +147,7 @@ TEST_F(ParallelTest, TestInvokePartialMiddle) { } } -TEST_F(ParallelTest, TestChunkSize2) { +TEST_P(ParallelTest, TestChunkSize2) { EXPECT_TRUE(parallel_for(0, 10, 2, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -146,7 +157,7 @@ TEST_F(ParallelTest, TestChunkSize2) { } } -TEST_F(ParallelTest, TestChunkSize2Middle) { +TEST_P(ParallelTest, TestChunkSize2Middle) { EXPECT_TRUE(parallel_for(3, 8, 2, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -162,7 +173,7 @@ TEST_F(ParallelTest, TestChunkSize2Middle) { } } -TEST_F(ParallelTest, TestChunkSize3) { +TEST_P(ParallelTest, TestChunkSize3) { EXPECT_TRUE(parallel_for(0, 10, 3, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -172,7 +183,7 @@ TEST_F(ParallelTest, TestChunkSize3) { } } -TEST_F(ParallelTest, TestChunkSize6) { +TEST_P(ParallelTest, TestChunkSize6) { EXPECT_TRUE(parallel_for(0, 10, 6, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -182,7 +193,7 @@ TEST_F(ParallelTest, TestChunkSize6) { } } -TEST_F(ParallelTest, TestChunkSizeTooLarge) { +TEST_P(ParallelTest, TestChunkSizeTooLarge) { EXPECT_TRUE(parallel_for(0, 10, 11, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -191,3 +202,5 @@ TEST_F(ParallelTest, TestChunkSizeTooLarge) { 
EXPECT_EQ(data_[i], i); } } + +INSTANTIATE_TEST_SUITE_P(ParallelTestWithOrWithoutThreadpool, ParallelTest, ::testing::Values(true, false)); diff --git a/extension/parallel/thread_parallel.cpp b/extension/threadpool/thread_parallel.cpp similarity index 97% rename from extension/parallel/thread_parallel.cpp rename to extension/threadpool/thread_parallel.cpp index 5d481ccd44c..fa26742368f 100644 --- a/extension/parallel/thread_parallel.cpp +++ b/extension/threadpool/thread_parallel.cpp @@ -9,9 +9,9 @@ #include #include -#include #include #include +#include #include namespace executorch { diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index c6d31c20263..23e26bfa72b 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -43,7 +43,7 @@ endif() list(TRANSFORM _optimized_cpublas__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(cpublas STATIC ${_optimized_cpublas__srcs}) target_link_libraries( - cpublas PUBLIC executorch_core eigen_blas extension_parallel extension_threadpool + cpublas PUBLIC executorch_core eigen_blas extension_threadpool ) target_compile_options(cpublas PUBLIC ${_common_compile_options}) diff --git a/kernels/optimized/blas/BlasKernel.h b/kernels/optimized/blas/BlasKernel.h index c2b03cfebdd..fc47b4482d6 100644 --- a/kernels/optimized/blas/BlasKernel.h +++ b/kernels/optimized/blas/BlasKernel.h @@ -11,8 +11,8 @@ #include #include -#include #include +#include #include diff --git a/kernels/optimized/lib_defs.bzl b/kernels/optimized/lib_defs.bzl index 659c7afe090..dd246f38984 100644 --- a/kernels/optimized/lib_defs.bzl +++ b/kernels/optimized/lib_defs.bzl @@ -186,7 +186,10 @@ def define_libs(is_fbcode=False): ], ) - LIBBLAS_DEPS = [third_party_dep("cpuinfo")] + LIBBLAS_DEPS = [ + third_party_dep("cpuinfo"), + "//executorch/extension/threadpool:threadpool", + ] for libblas_name, mkl_dep in [("libblas", "fbsource//third-party/mkl:mkl_lp64_omp"), ("libblas_mkl_noomp", 
"fbsource//third-party/mkl:mkl")]: runtime.cxx_library( @@ -229,7 +232,6 @@ def define_libs(is_fbcode=False): "DEFAULT": [], }) + LIBBLAS_DEPS, exported_deps = [ - "//executorch/extension/parallel:thread_parallel", "//executorch/kernels/optimized:libutils", "//executorch/runtime/core/exec_aten:lib", ], diff --git a/runtime/kernel/targets.bzl b/runtime/kernel/targets.bzl index d49435f2825..e67f76728b8 100644 --- a/runtime/kernel/targets.bzl +++ b/runtime/kernel/targets.bzl @@ -51,6 +51,19 @@ def define_common_targets(): preprocessor_flags = ["-DMAX_KERNEL_NUM=1"], ) + runtime.cxx_library( + name = "thread_parallel_interface", + exported_headers = ["thread_parallel_interface.h"], + exported_deps = [ + "//executorch/runtime/core:core", + "//executorch/runtime/platform:platform", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + ) + for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" diff --git a/runtime/kernel/thread_parallel_interface.h b/runtime/kernel/thread_parallel_interface.h new file mode 100644 index 00000000000..82e34ecf7c0 --- /dev/null +++ b/runtime/kernel/thread_parallel_interface.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +#include +#include + +namespace executorch { +namespace extension { +namespace internal { +template +inline bool parallel_for_no_threadpool( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const Func& f) { + ET_CHECK_OR_RETURN_FALSE( + begin >= 0 && end >= 0 && end >= begin, + "begin = %" PRId64 ", end = %" PRId64, + begin, + end); + ET_CHECK_OR_RETURN_FALSE(grain_size > 0, "grain_size = %" PRId64, grain_size); + f(begin, end); + return true; +} + +} // namespace internal + +#ifdef ET_USE_THREADPOOL +/** + * A helper to run a function in parallel. + * + * begin, end: describe the extent of the workitems via first and last workitem + * to be processed + * grain_size: number of workitems processed by user callback which is + * described below + * f: user function applied in parallel to the chunks, signature: + * void f(int64_t begin, int64_t end) + * Returns true if all work items are processed successfully, false otherwise + * + * Warning: parallel_for does NOT copy thread local states from the current + * thread to the worker threads. Users need to protect the access to captured + * data if they mutate them in f. 
+ */ +bool parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const std::function& f); + +int64_t get_thread_num(); + +void set_thread_num(int64_t thread_num); +#else // ET_USE_THREADPOOL +template +bool parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const Func& func) { + return internal::parallel_for_no_threadpool(begin, end, grain_size, func); +} + +inline int64_t get_thread_num() { + return 0; +} + +void set_thread_num(int64_t thread_num) { + ET_DCHECK_MSG(false, "cannot set_thread_num without threading support!"); +} +#endif // ET_USE_THREADPOOL +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::get_thread_num; +using ::executorch::extension::parallel_for; +using ::executorch::extension::set_thread_num; +} // namespace executor +} // namespace torch diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json index cc5e625f1e8..be594f9d5f4 100644 --- a/test/utils/OSSTestConfig.json +++ b/test/utils/OSSTestConfig.json @@ -59,6 +59,16 @@ "extension_tensor" ] }, + { + "directory": "extension/threadpool/test", + "sources": [ + "thread_parallel_test.cpp", + "threadpool_test.cpp" + ], + "additional_libs": [ + "extension_threadpool" + ] + }, { "directory": "kernels/portable/cpu/util/test", "sources": [ From 0102e256c1b5dff99bad9ef25e6ab3982d2ab9b3 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 5 Mar 2025 17:36:57 -0800 Subject: [PATCH 11/38] Update [ghstack-poisoned] --- extension/llm/custom_ops/op_sdpa.cpp | 2 +- extension/threadpool/test/thread_parallel_test.cpp | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index db7cb42f6d0..371fcf38a24 100644 --- 
a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -19,8 +19,8 @@ #include #ifdef ET_USE_THREADPOOL -#include #include +#include #endif #include diff --git a/extension/threadpool/test/thread_parallel_test.cpp b/extension/threadpool/test/thread_parallel_test.cpp index 63581be29e8..e31f16eee22 100644 --- a/extension/threadpool/test/thread_parallel_test.cpp +++ b/extension/threadpool/test/thread_parallel_test.cpp @@ -46,11 +46,16 @@ class ParallelTest : public ::testing::TestWithParam { } template - bool parallel_for(const int64_t begin, const int64_t end, const int64_t grain_size, const Func& func) { + bool parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const Func& func) { if (GetParam()) { return executorch::extension::parallel_for(begin, end, grain_size, func); } - return executorch::extension::internal::parallel_for_no_threadpool(begin, end, grain_size, func); + return executorch::extension::internal::parallel_for_no_threadpool( + begin, end, grain_size, func); } std::array data_; @@ -203,4 +208,7 @@ TEST_P(ParallelTest, TestChunkSizeTooLarge) { } } -INSTANTIATE_TEST_SUITE_P(ParallelTestWithOrWithoutThreadpool, ParallelTest, ::testing::Values(true, false)); +INSTANTIATE_TEST_SUITE_P( + ParallelTestWithOrWithoutThreadpool, + ParallelTest, + ::testing::Values(true, false)); From 956f8a5ec412862697753db5c2d8f84decb990bb Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 5 Mar 2025 17:36:57 -0800 Subject: [PATCH 12/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/reduce_util.h | 20 ++++++++++++++------ kernels/portable/cpu/util/targets.bzl | 6 +++++- runtime/kernel/thread_parallel_interface.h | 14 +++++++++++++- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/kernels/portable/cpu/util/reduce_util.h b/kernels/portable/cpu/util/reduce_util.h index 35cfdfbaa72..6d7b17443ee 100644 --- a/kernels/portable/cpu/util/reduce_util.h +++ 
b/kernels/portable/cpu/util/reduce_util.h @@ -8,8 +8,10 @@ #pragma once +#include #include #include +#include #include #include @@ -24,9 +26,12 @@ void apply_on_flat_ix_with_stride_and_base( const size_t base, const size_t start, const size_t end) { - for (size_t i = start; i <= end; i++) { - fn(base + i * stride); - } + executorch::extension::parallel_for( + start, end + 1, [&](auto start_, auto end_) { + for (const auto i : c10::irange(start_, end_)) { + fn(base + i * stride); + } + }); } template @@ -36,9 +41,12 @@ void apply_on_flat_and_dim_ix_with_stride_and_base( const size_t base, const size_t start, const size_t end) { - for (size_t i = start; i <= end; i++) { - fn(base + i * stride, i); - } + executorch::extension::parallel_for( + start, end + 1, [&](auto start_, auto end_) { + for (const auto i : c10::irange(start_, end_)) { + fn(base + i * stride, i); + } + }); } template diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index c42f38fd8b0..3a7e4e1f9bc 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -299,8 +299,12 @@ def define_common_targets(): srcs = ["reduce_util.cpp"], exported_headers = ["reduce_util.h"], deps = [ - "//executorch/runtime/kernel:kernel_includes{}".format(suffix), "//executorch/runtime/core/exec_aten/util:tensor_util{}".format(suffix), + "//executorch/runtime/kernel:kernel_includes{}".format(suffix), + ], + exported_deps = [ + "//executorch/runtime/kernel:thread_parallel_interface", + "//executorch/runtime/core/portable_type/c10/c10:c10", ], exported_preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_mode else [], visibility = [ diff --git a/runtime/kernel/thread_parallel_interface.h b/runtime/kernel/thread_parallel_interface.h index 82e34ecf7c0..ad90218fd22 100644 --- a/runtime/kernel/thread_parallel_interface.h +++ b/runtime/kernel/thread_parallel_interface.h @@ -33,6 +33,10 @@ inline bool parallel_for_no_threadpool( return true; } +// Match 
GRAIN_SIZE from PyTorch core. +// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/TensorIterator.h#L78 +constexpr int64_t GRAIN_SIZE = 32768; + } // namespace internal #ifdef ET_USE_THREADPOOL @@ -74,10 +78,18 @@ inline int64_t get_thread_num() { return 0; } -void set_thread_num(int64_t thread_num) { +inline void set_thread_num(int64_t thread_num) { ET_DCHECK_MSG(false, "cannot set_thread_num without threading support!"); } #endif // ET_USE_THREADPOOL + +/** + * Convenience version of parallel_for that sets the grain size to internal::GRAIN_SIZE. + */ +template +bool parallel_for(const int64_t begin, const int64_t end, const Func& func) { + return parallel_for(begin, end, internal::GRAIN_SIZE, func); +} } // namespace extension } // namespace executorch From 9f7f0c1fb07bb90be79c8aeccc787853c361dad1 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 5 Mar 2025 18:29:03 -0800 Subject: [PATCH 13/38] Update [ghstack-poisoned] --- configurations/CMakeLists.txt | 13 ++++++++++++- extension/android/CMakeLists.txt | 4 ---- kernels/portable/CMakeLists.txt | 17 +++++++++++++++++ 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/configurations/CMakeLists.txt b/configurations/CMakeLists.txt index 462124a6ea6..a63999f8833 100644 --- a/configurations/CMakeLists.txt +++ b/configurations/CMakeLists.txt @@ -47,12 +47,23 @@ if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) message("Generated files ${gen_command_sources}") # optimized_native_cpu_ops_lib: Register optimized op kernels into the runtime + if(NOT DEFINED EXECUTORCH_HAVE_OPTIMIZED_PORTABLE_KERNELS) + message(FATAL_ERROR "EXECUTORCH_HAVE_OPTIMIZED_PORTABLE_KERNELS was not defined!") + endif() + if(${EXECUTORCH_HAVE_OPTIMIZED_PORTABLE_KERNELS}) + if(NOT TARGET optimized_portable_kernels) + message(FATAL_ERROR "optimized_portable_kernels missing") + endif() + set(_optimized_native_cpu_ops_lib_portable_kernels_lib optimized_portable_kernels) + else() + 
set(_optimized_native_cpu_ops_lib_portable_kernels_lib portable_kernels) + endif() gen_operators_lib( LIB_NAME "optimized_native_cpu_ops_lib" KERNEL_LIBS - portable_kernels optimized_kernels + ${_optimized_native_cpu_ops_lib_portable_kernels_lib} DEPS executorch ) diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index ba722d9c791..03595efbfea 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -84,10 +84,6 @@ if(TARGET optimized_native_cpu_ops_lib) APPEND link_libraries optimized_native_cpu_ops_lib - optimized_kernels - portable_kernels - cpublas - eigen_blas ) target_link_options_shared_lib(optimized_native_cpu_ops_lib) else() diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index 885c509246b..2cecefd6d24 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -63,6 +63,23 @@ gen_operators_lib( LIB_NAME "portable_ops_lib" KERNEL_LIBS portable_kernels DEPS executorch ) +# Portable kernels support optional parallelization (and, in the +# future, perhaps other performance features). If support is present, +# produce an optimized version. 
+set(EXECUTORCH_HAVE_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL PARENT_SCOPE) +set(EXECUTORCH_HAVE_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL) + +if(${EXECUTORCH_HAVE_OPTIMIZED_PORTABLE_KERNELS}) + add_library(optimized_portable_kernels ${_portable_kernels__srcs}) + target_link_libraries(optimized_portable_kernels PRIVATE executorch) + target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool) + target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options}) + install( + TARGETS optimized_portable_kernels + DESTINATION lib + ) +endif() + install( TARGETS portable_kernels portable_ops_lib DESTINATION lib From c130224cbfd24d014f1e758264b9c974426dc683 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 5 Mar 2025 20:09:42 -0800 Subject: [PATCH 14/38] Update [ghstack-poisoned] --- extension/parallel/thread_parallel.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/extension/parallel/thread_parallel.h b/extension/parallel/thread_parallel.h index 5f4edeb333c..8bd1a572cd7 100644 --- a/extension/parallel/thread_parallel.h +++ b/extension/parallel/thread_parallel.h @@ -9,8 +9,6 @@ #pragma once // This header is a stub left behind after the move to -// executorch/runtime/kernel. Depend on this target and include this -// header if you have a hard requirement for threading; if you want to -// cleanly use parallelization if available, then depend on and use -// the below header instead. +// executorch/runtime/kernel. As such, it is deprecated; include and +// use the below header directly instead. 
#include From 754a4f6db525a3c33327b8ca9c00e6f22326266d Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 6 Mar 2025 09:56:39 -0800 Subject: [PATCH 15/38] Update [ghstack-poisoned] --- build/executorch-config.cmake | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index e2cff7da6b5..b49f45aa241 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -1,4 +1,3 @@ - # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # @@ -16,20 +15,23 @@ # # This will define the following variables: # -# EXECUTORCH_FOUND -- True if the system has the ExecuTorch library -# EXECUTORCH_INCLUDE_DIRS -- The include directories for ExecuTorch -# EXECUTORCH_LIBRARIES -- Libraries to link against +# EXECUTORCH_FOUND -- True if the system has the ExecuTorch library +# EXECUTORCH_INCLUDE_DIRS -- The include directories for ExecuTorch +# EXECUTORCH_LIBRARIES -- Libraries to link against # -# The actual values for these variables will be different from what executorch-config.cmake -# in executorch pip package gives, but we wanted to keep the contract of exposing these -# CMake variables. +# The actual values for these variables will be different from what +# executorch-config.cmake in executorch pip package gives, but we wanted to keep +# the contract of exposing these CMake variables. 
cmake_minimum_required(VERSION 3.19) set(_root "${CMAKE_CURRENT_LIST_DIR}/../../..") set(required_lib_list executorch executorch_core portable_kernels) set(EXECUTORCH_LIBRARIES) -set(EXECUTORCH_INCLUDE_DIRS ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 ${_root}/lib) +set(EXECUTORCH_INCLUDE_DIRS + ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 + ${_root}/lib +) foreach(lib ${required_lib_list}) set(lib_var "LIB_${lib}") add_library(${lib} STATIC IMPORTED) @@ -40,7 +42,12 @@ foreach(lib ${required_lib_list}) ) set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}") target_compile_definitions(${lib} INTERFACE C10_USING_CUSTOM_GENERATED_MACROS) - target_include_directories(${lib} INTERFACE ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 ${_root}/lib) + target_include_directories( + ${lib} + INTERFACE ${_root}/include + ${_root}/include/executorch/runtime/core/portable_type/c10 + ${_root}/lib + ) list(APPEND EXECUTORCH_LIBRARIES ${lib}) endforeach() @@ -112,7 +119,12 @@ foreach(lib ${lib_list}) add_library(${lib} STATIC IMPORTED) endif() set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}") - target_include_directories(${lib} INTERFACE ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 ${_root}/lib) + target_include_directories( + ${lib} + INTERFACE ${_root}/include + ${_root}/include/executorch/runtime/core/portable_type/c10 + ${_root}/lib + ) list(APPEND EXECUTORCH_LIBRARIES ${lib}) endif() endforeach() From 6350e07e6ee0fe130ef804d223b91957a1a0d1c5 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 7 Mar 2025 09:48:06 -0800 Subject: [PATCH 16/38] Update [ghstack-poisoned] --- runtime/kernel/targets.bzl | 1 + runtime/kernel/thread_parallel_interface.h | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/runtime/kernel/targets.bzl b/runtime/kernel/targets.bzl index e67f76728b8..5c95f10276d 100644 --- 
a/runtime/kernel/targets.bzl +++ b/runtime/kernel/targets.bzl @@ -56,6 +56,7 @@ def define_common_targets(): exported_headers = ["thread_parallel_interface.h"], exported_deps = [ "//executorch/runtime/core:core", + "//executorch/runtime/core/portable_type/c10/c10:c10", "//executorch/runtime/platform:platform", ], visibility = [ diff --git a/runtime/kernel/thread_parallel_interface.h b/runtime/kernel/thread_parallel_interface.h index 82e34ecf7c0..bd1d48f52bc 100644 --- a/runtime/kernel/thread_parallel_interface.h +++ b/runtime/kernel/thread_parallel_interface.h @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -29,7 +30,17 @@ inline bool parallel_for_no_threadpool( begin, end); ET_CHECK_OR_RETURN_FALSE(grain_size > 0, "grain_size = %" PRId64, grain_size); +#ifndef NDEBUG + // Go backwards through the range elementwise to catch code that + // assumes parallel_for is in order like a regular for loop. + for (const auto i : c10::irange(begin, end)) { + const auto offset = i - begin; + const auto idx = end - offset - 1; + f(idx, idx + 1); + } +#else // NDEBUG f(begin, end); +#endif return true; } From 4dd58a06352c4e895178f9b120816cfe13e67e80 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 7 Mar 2025 12:40:48 -0800 Subject: [PATCH 17/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/broadcast_util.cpp | 22 ---------- kernels/portable/cpu/util/broadcast_util.h | 31 +------------ .../portable/cpu/util/delinearize_index.cpp | 33 ++++++++++++++ kernels/portable/cpu/util/delinearize_index.h | 43 +++++++++++++++++++ kernels/portable/cpu/util/targets.bzl | 6 ++- 5 files changed, 82 insertions(+), 53 deletions(-) create mode 100644 kernels/portable/cpu/util/delinearize_index.cpp create mode 100644 kernels/portable/cpu/util/delinearize_index.h diff --git a/kernels/portable/cpu/util/broadcast_util.cpp b/kernels/portable/cpu/util/broadcast_util.cpp index 381e07cbe30..28a34426b23 100644 --- a/kernels/portable/cpu/util/broadcast_util.cpp +++ 
b/kernels/portable/cpu/util/broadcast_util.cpp @@ -269,28 +269,6 @@ ET_NODISCARD Error get_broadcast_target_size( a.sizes(), b.sizes(), out_sizes, out_sizes_len, out_dim); } -void delinearize_index( - size_t linear_index, - executorch::aten::ArrayRef shape, - size_t* out_indexes, - const size_t out_indexes_len) { - ET_CHECK(shape.size() <= out_indexes_len); - for (size_t i = 0; i < shape.size(); ++i) { - auto dim = shape.size() - 1 - i; - auto dim_size = shape[dim]; - out_indexes[dim] = linear_index % dim_size; - linear_index /= dim_size; - } -} - -void delinearize_index( - size_t linear_index, - const Tensor& t, - size_t* out_indexes, - const size_t out_indexes_len) { - delinearize_index(linear_index, t.sizes(), out_indexes, out_indexes_len); -} - size_t linearize_access_indexes( ArrayRef indexes_broadcast_to, ssize_t broadcast_to_ndim, diff --git a/kernels/portable/cpu/util/broadcast_util.h b/kernels/portable/cpu/util/broadcast_util.h index f6bfae9bdaa..ed536f86c2d 100644 --- a/kernels/portable/cpu/util/broadcast_util.h +++ b/kernels/portable/cpu/util/broadcast_util.h @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -207,36 +208,6 @@ ET_NODISCARD inline Error resize_to_broadcast_target_size( ET_DEPRECATED void free_broadcast_tensor( const executorch::aten::Tensor& broadcast_tensor); -/** - * Delinearize a flattened index to per-dimension indexes. - * - * @param[in] linear_index The flattened index - * @param[in] shape The tensor shape - * @param[out] out_indexes The per-dimension indexes - * @param[in] out_indexes_len The maximum size of the out_indexes array - * @returns void - */ -void delinearize_index( - size_t linear_index, - executorch::aten::ArrayRef shape, - size_t* out_indexes, - const size_t out_indexes_len); - -/** - * Delinearize a flattened index to per-dimension indexes. 
- * - * @param[in] linear_index The flattened index - * @param[in] t The tensor object - * @param[out] out_indexes The per-dimension indexes - * @param[in] out_indexes_len The maximum size of the out_indexes array - * @returns void - */ -void delinearize_index( - size_t linear_index, - const Tensor& t, - size_t* out_indexes, - const size_t out_indexes_len); - /** * Return the linear index for broatcast_from tensor, given the indexes and * number of dimensions of broadcast_to tensor, and the shape and strides diff --git a/kernels/portable/cpu/util/delinearize_index.cpp b/kernels/portable/cpu/util/delinearize_index.cpp new file mode 100644 index 00000000000..45378e6b05d --- /dev/null +++ b/kernels/portable/cpu/util/delinearize_index.cpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include + +namespace torch::executor { +void delinearize_index( + size_t linear_index, + executorch::aten::ArrayRef shape, + size_t* out_indexes, + const size_t out_indexes_len) { + ET_CHECK(shape.size() <= out_indexes_len); + for (size_t i = 0; i < shape.size(); ++i) { + auto dim = shape.size() - 1 - i; + auto dim_size = shape[dim]; + out_indexes[dim] = linear_index % dim_size; + linear_index /= dim_size; + } +} + +void delinearize_index( + size_t linear_index, + const Tensor& t, + size_t* out_indexes, + const size_t out_indexes_len) { + delinearize_index(linear_index, t.sizes(), out_indexes, out_indexes_len); +} +} // namespace torch::executor diff --git a/kernels/portable/cpu/util/delinearize_index.h b/kernels/portable/cpu/util/delinearize_index.h new file mode 100644 index 00000000000..3441aa6083f --- /dev/null +++ b/kernels/portable/cpu/util/delinearize_index.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace torch::executor { +/** + * Delinearize a flattened index to per-dimension indexes. + * + * @param[in] linear_index The flattened index + * @param[in] shape The tensor shape + * @param[out] out_indexes The per-dimension indexes + * @param[in] out_indexes_len The maximum size of the out_indexes array + * @returns void + */ +void delinearize_index( + size_t linear_index, + executorch::aten::ArrayRef shape, + size_t* out_indexes, + const size_t out_indexes_len); + +/** + * Delinearize a flattened index to per-dimension indexes. + * + * @param[in] linear_index The flattened index + * @param[in] t The tensor object + * @param[out] out_indexes The per-dimension indexes + * @param[in] out_indexes_len The maximum size of the out_indexes array + * @returns void + */ +void delinearize_index( + size_t linear_index, + const Tensor& t, + size_t* out_indexes, + const size_t out_indexes_len); +} // namespace torch::executor diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index 1c8edb9d3c7..e1c5cadfe84 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -66,9 +66,13 @@ def define_common_targets(): runtime.cxx_library( name = "broadcast_util", - srcs = ["broadcast_util.cpp"], + srcs = [ + "broadcast_util.cpp", + "delinearize_index.cpp", + ], exported_headers = [ "broadcast_util.h", + "delinearize_index.h", ], exported_deps = [ ":broadcast_indexes_range", From 1b6eb9f3309216b60ad32a09348de516c94bf6c8 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 7 Mar 2025 12:40:53 -0800 Subject: [PATCH 18/38] Update [ghstack-poisoned] --- .../cpu/util/broadcast_indexes_range.h | 43 ++++++++++-- .../test/broadcast_indexes_range_test.cpp | 70 ++++++++++++------- 2 files changed, 82 insertions(+), 31 deletions(-) diff --git 
a/kernels/portable/cpu/util/broadcast_indexes_range.h b/kernels/portable/cpu/util/broadcast_indexes_range.h index c623fdb4c31..c749541058a 100644 --- a/kernels/portable/cpu/util/broadcast_indexes_range.h +++ b/kernels/portable/cpu/util/broadcast_indexes_range.h @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -104,11 +105,42 @@ class BroadcastIndexesIterator { return it; } + BroadcastIndexesIterator& operator+=(difference_type n) { + if (n <= 3) { + std::advance(*this, n); + return *this; + } + + output_index() += n; + delinearize_index( + output_index(), + output_shape_, + delinearized_output_index_.data(), + delinearized_output_index_.size()); + for (const auto ii : c10::irange(1, kNumInputs + 1)) { + current_indexes_[ii] = 0; + for (const auto jj : c10::irange(output_dim_)) { + current_indexes_[ii] += delinearized_output_index_[jj] * + effective_input_broadcast_strides_[ii - 1][jj]; + } + } + return *this; + } + + BroadcastIndexesIterator operator+(difference_type n) { + auto it = *this; + it += n; + return it; + } + difference_type operator-(const BroadcastIndexesIterator& rhs) const { return difference_type(output_index() - rhs.output_index()); } private: + using ShapeType = + std::array; + ssize_t output_index() const { return current_indexes_[0]; } @@ -117,11 +149,10 @@ class BroadcastIndexesIterator { return current_indexes_[0]; } - std::array - effective_input_broadcast_stride(const Tensor& output, const Tensor& t) - const { - std::array - result = {0}; + ShapeType effective_input_broadcast_stride( + const Tensor& output, + const Tensor& t) const { + ShapeType result = {0}; ET_CHECK_MSG( t.dim() <= output.dim(), "input to broadcasting op should have dim at most output dim, but %d > %d!", @@ -146,8 +177,6 @@ class BroadcastIndexesIterator { // The 0th entry is the current linear index into the output, // followed by kNumInputs input indexes. 
std::array current_indexes_ = {0}; - using ShapeType = std:: - array; ShapeType delinearized_output_index_ = {0}; ssize_t output_dim_; ArrayRef output_shape_; diff --git a/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp b/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp index f147958558d..519cd9fe9f9 100644 --- a/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp +++ b/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp @@ -68,6 +68,15 @@ TEST(BroadcastIndexesRangeTest, ScalarBroadcastToOneD) { EXPECT_EQ(expected, actual); } +template +void test_operator_plus(const Range& range) { + size_t idx = 0; + for (const auto indexes : range) { + EXPECT_EQ(*(range.begin() + idx), indexes); + idx++; + } +} + // [1] -> [H, W] // [W] -> [H, W] // [1, 1] -> [H, W] @@ -87,14 +96,15 @@ TEST(BroadcastIndexesRangeTest, OneAndTwoDExhaustive) { Tensor in_not_broadcast = tf.zeros({3, 4}); - auto actual = range_to_vec(BroadcastIndexesRange<6>( + const auto range = BroadcastIndexesRange<6>( out, in_0d_scalar, in_1d_scalar, in_2d_scalar, in_row, in_col, - in_not_broadcast)); + in_not_broadcast); + auto actual = range_to_vec(range); decltype(actual) expected = { {0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 1, 0, 1}, @@ -110,6 +120,8 @@ TEST(BroadcastIndexesRangeTest, OneAndTwoDExhaustive) { {11, 0, 0, 0, 3, 2, 11}, }; EXPECT_EQ(expected, actual); + + test_operator_plus(range); } // Make sure nothing is thrown off by a size-1 dim in the output: @@ -138,20 +150,20 @@ TEST(BroadcastIndexesRangeTest, OneAndTwoDWith1InOutputShapeExhaustive) { Tensor in_col = tf.zeros({H, 1}); size_t idx = 0; + const auto range_row = BroadcastIndexesRange<5>( + out_row, + in_0d_scalar, + in_1d_scalar, + in_2d_scalar, + in_row, + in_leading_one_row); for (const auto [out_idx, in_0d_idx, in_1d_idx, in_2d_idx, in_row_idx, - in_leading_one_row_idx] : - BroadcastIndexesRange<5>( - out_row, - in_0d_scalar, - in_1d_scalar, - in_2d_scalar, - in_row, - in_leading_one_row)) { 
+ in_leading_one_row_idx] : range_row) { EXPECT_EQ(out_idx, idx++); EXPECT_EQ(in_0d_idx, 0); EXPECT_EQ(in_1d_idx, 0); @@ -160,16 +172,21 @@ TEST(BroadcastIndexesRangeTest, OneAndTwoDWith1InOutputShapeExhaustive) { EXPECT_EQ(in_leading_one_row_idx, out_idx); } + test_operator_plus(range_row); + idx = 0; + const auto range_col = BroadcastIndexesRange<4>( + out_col, in_0d_scalar, in_1d_scalar, in_2d_scalar, in_col); for (const auto [out_idx, in_0d_idx, in_1d_idx, in_2d_idx, in_col_idx] : - BroadcastIndexesRange<4>( - out_col, in_0d_scalar, in_1d_scalar, in_2d_scalar, in_col)) { + range_col) { EXPECT_EQ(out_idx, idx++); EXPECT_EQ(in_0d_idx, 0); EXPECT_EQ(in_1d_idx, 0); EXPECT_EQ(in_2d_idx, 0); EXPECT_EQ(in_col_idx, out_idx); } + + test_operator_plus(range_col); } // [1, 1, 1] -> [C, H, W] @@ -197,16 +214,17 @@ TEST(BroadcastIndexesRangeTest, ThreeDBroadcasting) { // take the opportunity to mutation test against delinearize_index // and linearize_access_indexes. int idx = 0; - for (const auto indexes : BroadcastIndexesRange<8>( - out, - input_tensors[0], - input_tensors[1], - input_tensors[2], - input_tensors[3], - input_tensors[4], - input_tensors[5], - input_tensors[6], - input_tensors[7])) { + const auto range = BroadcastIndexesRange<8>( + out, + input_tensors[0], + input_tensors[1], + input_tensors[2], + input_tensors[3], + input_tensors[4], + input_tensors[5], + input_tensors[6], + input_tensors[7]); + for (const auto indexes : range) { const auto out_idx = indexes[0]; EXPECT_EQ(out_idx, idx++); size_t out_indexes[executorch::runtime::kTensorDimensionLimit]; @@ -219,6 +237,7 @@ TEST(BroadcastIndexesRangeTest, ThreeDBroadcasting) { out_indexes, out.dim(), input_tensors[tensor_idx])); } } + test_operator_plus(range); } // 4-D should generalize, but we will go ahead and test: @@ -235,8 +254,9 @@ void four_d_broadcasting_test() { // take the opportunity to mutation test against delinearize_index // and linearize_access_indexes. 
int idx = 0; - for (const auto [out_idx, in_cw_idx, in_nh_idx] : - BroadcastIndexesRange<2>(out, in_broadcast_cw, in_broadcast_nh)) { + const auto range = + BroadcastIndexesRange<2>(out, in_broadcast_cw, in_broadcast_nh); + for (const auto [out_idx, in_cw_idx, in_nh_idx] : range) { EXPECT_EQ(out_idx, idx++); size_t out_indexes[executorch::runtime::kTensorDimensionLimit]; delinearize_index( @@ -248,6 +268,8 @@ void four_d_broadcasting_test() { in_nh_idx, linearize_access_indexes(out_indexes, out.dim(), in_broadcast_nh)); } + + test_operator_plus(range); } TEST(BroadcastIndexesRangeTest, FourDBroadcasting) { From 450e50b0cb810bf52ead3e5a8aed705916919157 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 7 Mar 2025 12:40:57 -0800 Subject: [PATCH 19/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/elementwise_util.h | 208 ++++++++----------- 1 file changed, 89 insertions(+), 119 deletions(-) diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index 09db5f7180d..a5bcd6ff98b 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -14,6 +14,9 @@ #include #include +#include +#include + namespace torch { namespace executor { namespace native { @@ -46,38 +49,94 @@ inline int64_t scalar_to(const Scalar& s) { : s.to(); } -template -inline void apply_unitensor_elementwise_fn( +namespace internal { +template < + typename CTYPE_COMMON, + const char* op_name, + typename Op, + typename... Args> +inline void apply_elementwise_fn( const Op& compute_fun, KernelRuntimeContext& ctx, - const Tensor& a, - SupportedTensorDtypes a_dtypes, const Tensor& out, - SupportedTensorDtypes out_dtypes) { + SupportedTensorDtypes out_dtypes, + Args... 
inputs) { + static_assert( + (std::is_same_v> && + ...)); + constexpr auto kNumInputs = sizeof...(inputs); constexpr auto compute_type = CppTypeToScalarType::value; - + const auto check_input_dtype = [](auto input, auto compute_type) { + return internal::check_tensor_dtype( + *input.first, input.second, compute_type); + }; ET_KERNEL_CHECK( ctx, - (internal::check_tensor_dtype(a, a_dtypes, compute_type) && - internal::check_tensor_dtype(out, out_dtypes, compute_type)), + (check_input_dtype(inputs, compute_type) && ...) && + internal::check_tensor_dtype(out, out_dtypes, compute_type), InvalidArgument, ); - const auto load_a_to_common = - internal::get_load_to_common_fn(a, a_dtypes); + bool any_is_broadcasted = false; + if constexpr (kNumInputs > 1) { + any_is_broadcasted = (!out.sizes().equals(inputs.first->sizes()) || ...); + } + + struct InputInfo { + load_to_common_fn load_to_common; + const char* data_ptr; + ssize_t element_size; + }; + std::array inputs_info = {(InputInfo{ + internal::get_load_to_common_fn( + *inputs.first, inputs.second), + reinterpret_cast(inputs.first->const_data_ptr()), + inputs.first->element_size(), + })...}; + const auto store_common_to_out = internal::get_store_common_to_tensor_fn( out, out_dtypes); - const char* const data_a = reinterpret_cast(a.const_data_ptr()); - const auto a_element_size = a.element_size(); - const auto out_element_size = out.element_size(); char* const data_out = reinterpret_cast(out.mutable_data_ptr()); + const auto out_element_size = out.element_size(); - auto out_numel = out.numel(); - for (const auto i : c10::irange(out_numel)) { - auto result = compute_fun(load_a_to_common(&data_a[i * a_element_size])); - store_common_to_out(result, &data_out[i * out_element_size]); + if (any_is_broadcasted) { + for (const auto& indexes : + BroadcastIndexesRange(out, (*inputs.first)...)) { + std::array loaded_inputs; + for (const auto idx : c10::irange(kNumInputs)) { + const auto& input_info = inputs_info[idx]; + 
loaded_inputs[idx] = input_info.load_to_common( + &input_info.data_ptr[indexes[idx + 1] * input_info.element_size]); + } + auto result = std::apply(compute_fun, loaded_inputs); + store_common_to_out(result, &data_out[indexes[0] * out_element_size]); + } + } else { + for (const auto i : c10::irange(out.numel())) { + std::array loaded_inputs; + for (const auto idx : c10::irange(kNumInputs)) { + const auto& input_info = inputs_info[idx]; + loaded_inputs[idx] = input_info.load_to_common( + &input_info.data_ptr[i * input_info.element_size]); + } + auto result = std::apply(compute_fun, loaded_inputs); + store_common_to_out(result, &data_out[i * out_element_size]); + } } } +} // namespace internal + +template +inline void apply_unitensor_elementwise_fn( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& a, + SupportedTensorDtypes a_dtypes, + const Tensor& out, + SupportedTensorDtypes out_dtypes) { + internal::apply_elementwise_fn( + compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes)); +} /** * Useful for bi-tensor elementwise operators. 
For each element of the inputs, @@ -94,53 +153,13 @@ inline void apply_bitensor_elementwise_fn( SupportedTensorDtypes b_dtypes, const Tensor& out, SupportedTensorDtypes out_dtypes) { - constexpr auto compute_type = CppTypeToScalarType::value; - - ET_KERNEL_CHECK( + internal::apply_elementwise_fn( + compute_fun, ctx, - (internal::check_tensor_dtype(a, a_dtypes, compute_type) && - internal::check_tensor_dtype(b, b_dtypes, compute_type) && - internal::check_tensor_dtype(out, out_dtypes, compute_type)), - InvalidArgument, ); - - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); - - const auto load_a_to_common = - internal::get_load_to_common_fn(a, a_dtypes); - const auto load_b_to_common = - internal::get_load_to_common_fn(b, b_dtypes); - const auto store_common_to_out = - internal::get_store_common_to_tensor_fn( - out, out_dtypes); - const char* const data_a = reinterpret_cast(a.const_data_ptr()); - const char* const data_b = reinterpret_cast(b.const_data_ptr()); - const auto a_element_size = a.element_size(); - const auto b_element_size = b.element_size(); - const auto out_element_size = out.element_size(); - char* const data_out = reinterpret_cast(out.mutable_data_ptr()); - - auto out_numel = out.numel(); - if (any_is_broadcasted) { - for (const auto [out_index, a_index, b_index] : - BroadcastIndexesRange<2>(out, a, b)) { - auto result = compute_fun( - load_a_to_common(&data_a[a_index * a_element_size]), - load_b_to_common(&data_b[b_index * b_element_size])); - store_common_to_out(result, &data_out[out_index * out_element_size]); - } - } else { - for (const auto i : c10::irange(out_numel)) { - size_t a_linear_index = i; - size_t b_linear_index = i; - - auto result = compute_fun( - load_a_to_common(&data_a[a_linear_index * a_element_size]), - load_b_to_common(&data_b[b_linear_index * b_element_size])); - 
store_common_to_out(result, &data_out[i * out_element_size]); - } - } + out, + out_dtypes, + std::make_pair(&a, a_dtypes), + std::make_pair(&b, b_dtypes)); } /** @@ -175,63 +194,14 @@ inline void apply_tritensor_elementwise_fn( SupportedTensorDtypes c_dtypes, const Tensor& out, SupportedTensorDtypes out_dtypes) { - constexpr auto compute_type = CppTypeToScalarType::value; - - ET_KERNEL_CHECK( + internal::apply_elementwise_fn( + compute_fun, ctx, - (internal::check_tensor_dtype(a, a_dtypes, compute_type) && - internal::check_tensor_dtype(b, b_dtypes, compute_type) && - internal::check_tensor_dtype(c, c_dtypes, compute_type) && - internal::check_tensor_dtype(out, out_dtypes, compute_type)), - InvalidArgument, ); - - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool c_is_broadcasted = !out.sizes().equals(c.sizes()); - const bool any_is_broadcasted = - (a_is_broadcasted || b_is_broadcasted || c_is_broadcasted); - - const auto load_a_to_common = - internal::get_load_to_common_fn(a, a_dtypes); - const auto load_b_to_common = - internal::get_load_to_common_fn(b, b_dtypes); - const auto load_c_to_common = - internal::get_load_to_common_fn(c, c_dtypes); - const auto store_common_to_out = - internal::get_store_common_to_tensor_fn( - out, out_dtypes); - const char* const data_a = reinterpret_cast(a.const_data_ptr()); - const char* const data_b = reinterpret_cast(b.const_data_ptr()); - const char* const data_c = reinterpret_cast(c.const_data_ptr()); - const auto a_element_size = a.element_size(); - const auto b_element_size = b.element_size(); - const auto c_element_size = c.element_size(); - const auto out_element_size = out.element_size(); - char* const data_out = reinterpret_cast(out.mutable_data_ptr()); - - auto out_numel = out.numel(); - if (any_is_broadcasted) { - for (const auto [out_index, a_index, b_index, c_index] : - BroadcastIndexesRange<3>(out, a, b, c)) { - auto result = 
compute_fun( - load_a_to_common(&data_a[a_index * a_element_size]), - load_b_to_common(&data_b[b_index * b_element_size]), - load_c_to_common(&data_c[c_index * c_element_size])); - store_common_to_out(result, &data_out[out_index * out_element_size]); - } - } else { - for (const auto i : c10::irange(out_numel)) { - size_t a_linear_index = i; - size_t b_linear_index = i; - size_t c_linear_index = i; - - auto result = compute_fun( - load_a_to_common(&data_a[a_linear_index * a_element_size]), - load_b_to_common(&data_b[b_linear_index * b_element_size]), - load_c_to_common(&data_c[c_linear_index * c_element_size])); - store_common_to_out(result, &data_out[i * out_element_size]); - } - } + out, + out_dtypes, + std::make_pair(&a, a_dtypes), + std::make_pair(&b, b_dtypes), + std::make_pair(&c, c_dtypes)); } inline ScalarType get_compute_type(ScalarType& common_type) { From 4459a7e7b069cda3ddd071fde8a7d9232645ceb8 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 7 Mar 2025 12:41:02 -0800 Subject: [PATCH 20/38] Update [ghstack-poisoned] --- kernels/optimized/cpu/op_where.cpp | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/kernels/optimized/cpu/op_where.cpp b/kernels/optimized/cpu/op_where.cpp index 4d897ea6281..0c1958282a5 100644 --- a/kernels/optimized/cpu/op_where.cpp +++ b/kernels/optimized/cpu/op_where.cpp @@ -7,7 +7,7 @@ */ #include #include -#include +#include namespace torch { namespace executor { @@ -58,15 +58,25 @@ Tensor& opt_where_out( const bool* const data_cond = cond.const_data_ptr(); CTYPE_COMPUTE* const data_out = out.data_ptr(); if (any_is_broadcasted) { - for (const auto [out_index, a_index, b_index, cond_index] : - BroadcastIndexesRange<3>(out, a, b, cond)) { - data_out[out_index] = - data_cond[cond_index] ? 
data_a[a_index] : data_b[b_index]; - } + executorch::extension::parallel_for( + 0, out_numel, [&](const auto begin, const auto end) { + auto range = BroadcastIndexesRange<3>(out, a, b, cond); + auto begin_it = range.begin(); + begin_it += begin; + for (; (*begin_it)[0] < end; ++begin_it) { + const auto [out_index, a_index, b_index, cond_index] = + *begin_it; + data_out[out_index] = + data_cond[cond_index] ? data_a[a_index] : data_b[b_index]; + } + }); } else { - for (const auto i : c10::irange(out_numel)) { - data_out[i] = data_cond[i] ? data_a[i] : data_b[i]; - } + executorch::extension::parallel_for( + 0, out_numel, [&](const auto begin, const auto end) { + for (const auto i : c10::irange(begin, end)) { + data_out[i] = data_cond[i] ? data_a[i] : data_b[i]; + } + }); } }); } else { From fad4ed8a112b8a11be2af2dbbbf40ba98e486150 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 7 Mar 2025 13:39:07 -0800 Subject: [PATCH 21/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/broadcast_indexes_range.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernels/portable/cpu/util/broadcast_indexes_range.h b/kernels/portable/cpu/util/broadcast_indexes_range.h index c749541058a..5fa50d8d212 100644 --- a/kernels/portable/cpu/util/broadcast_indexes_range.h +++ b/kernels/portable/cpu/util/broadcast_indexes_range.h @@ -79,7 +79,9 @@ class BroadcastIndexesIterator { // You might wonder what happens if output_shape_[ii] == 0. In // that case, output.numel() would be 0, and thus we would have // begin() == end() and no iteration. 
- if ET_UNLIKELY (delinearized_output_index_[ii] == output_shape_[ii] - 1) { + if ET_UNLIKELY ( + static_cast(delinearized_output_index_[ii]) == + output_shape_[ii] - 1) { const auto old_delinearized_output_index_item = delinearized_output_index_[ii]; delinearized_output_index_[ii] = 0; From 37e42135a2577269da151b1060cc2c30f2f89d43 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 7 Mar 2025 16:10:47 -0800 Subject: [PATCH 22/38] Update [ghstack-poisoned] --- kernels/portable/cpu/op_argmin.cpp | 62 ++++++++++++++++++------------ 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/kernels/portable/cpu/op_argmin.cpp b/kernels/portable/cpu/op_argmin.cpp index a0ee82d2612..67f013637c3 100644 --- a/kernels/portable/cpu/op_argmin.cpp +++ b/kernels/portable/cpu/op_argmin.cpp @@ -12,6 +12,7 @@ #include #include +#include #include namespace torch { @@ -47,30 +48,43 @@ Tensor& argmin_out( ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "argmin.out", CTYPE, [&] { long* out_data = out.mutable_data_ptr(); - for (const auto out_ix : c10::irange(out.numel())) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { - // the below condition as written is equivalent to !isnan(accval) && - // (isnan(v) || v < acc_val). cases: - // - if neither acc_val nor v is NaN, !(v >= acc_val) is - // trivially equivalent to v < acc_val. - // - if acc_val is NaN, the whole thing is trivially false. - // - if acc_val is not NaN and v is NaN, then v >= acc_val - // - is false because all comparisons involving NaN are - // - false, so the result is true. The result is trivially - // - true for the above condition that uses isnan(v) as - // - well. 
- if (!std::isnan(acc_val) && !(v >= acc_val)) { - acc_val = v; - acc_ix = ix; - } - return std::tuple{acc_val, acc_ix}; - }, - in, - dim, - out_ix); - out_data[out_ix] = std::get<1>(acc); - } + // REVIEW: this is the parallelization strategy ATen uses + // specifically when the reduction is along the last dimension and + // that dimension is contiguous. Is there any particular reason we + // shouldn't just always use this strategy since we aren't + // otherwise capable of parallelizing reductions? + const auto reduction_size = + dim.has_value() ? in.sizes().at(dim.value()) : in.numel(); + const auto grain_size = std::max( + static_cast(1), + executorch::extension::internal::GRAIN_SIZE / reduction_size); + executorch::extension::parallel_for( + 0, out.numel(), grain_size, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + std::tuple acc = reduce_over_dim( + [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + // the below condition as written is equivalent to + // !isnan(accval) && (isnan(v) || v < acc_val). cases: + // - if neither acc_val nor v is NaN, !(v >= acc_val) is + // trivially equivalent to v < acc_val. + // - if acc_val is NaN, the whole thing is trivially false. + // - if acc_val is not NaN and v is NaN, then v >= acc_val + // - is false because all comparisons involving NaN are + // - false, so the result is true. The result is trivially + // - true for the above condition that uses isnan(v) as + // - well. 
+ if (!std::isnan(acc_val) && !(v >= acc_val)) { + acc_val = v; + acc_ix = ix; + } + return std::tuple{acc_val, acc_ix}; + }, + in, + dim, + out_ix); + out_data[out_ix] = std::get<1>(acc); + } + }); }); return out; From 4917358e55334df6237f2c70f747c5d4141d2702 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 10 Mar 2025 18:48:48 -0700 Subject: [PATCH 23/38] Update [ghstack-poisoned] --- runtime/kernel/thread_parallel_interface.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/kernel/thread_parallel_interface.h b/runtime/kernel/thread_parallel_interface.h index 82e34ecf7c0..1e79acc75a4 100644 --- a/runtime/kernel/thread_parallel_interface.h +++ b/runtime/kernel/thread_parallel_interface.h @@ -74,7 +74,7 @@ inline int64_t get_thread_num() { return 0; } -void set_thread_num(int64_t thread_num) { +inline void set_thread_num(int64_t thread_num) { ET_DCHECK_MSG(false, "cannot set_thread_num without threading support!"); } #endif // ET_USE_THREADPOOL From 477996063f3f954d8273cd17849006a7bae62fec Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 11 Mar 2025 08:40:21 -0700 Subject: [PATCH 24/38] Update [ghstack-poisoned] --- kernels/portable/cpu/op_argmin.cpp | 14 ++------------ kernels/portable/cpu/util/reduce_util.h | 19 +++++++++++++++++++ kernels/portable/cpu/util/targets.bzl | 3 +++ 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/kernels/portable/cpu/op_argmin.cpp b/kernels/portable/cpu/op_argmin.cpp index 699aee7034f..8b037e37544 100644 --- a/kernels/portable/cpu/op_argmin.cpp +++ b/kernels/portable/cpu/op_argmin.cpp @@ -48,18 +48,8 @@ Tensor& argmin_out( ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "argmin.out", CTYPE, [&] { long* out_data = out.mutable_data_ptr(); - // REVIEW: this is the parallelization strategy ATen uses - // specifically when the reduction is along the last dimension and - // that dimension is contiguous. 
Is there any particular reason we - // shouldn't just always use this strategy since we aren't - // otherwise capable of parallelizing reductions? - const auto reduction_size = - dim.has_value() ? in.sizes().at(dim.value()) : in.numel(); - const auto grain_size = std::max( - static_cast(1), - executorch::extension::internal::GRAIN_SIZE / reduction_size); - const bool success = executorch::extension::parallel_for( - 0, out.numel(), grain_size, [&](const auto begin, const auto end) { + const bool success = parallel_for_each_reduce_over_dim_output_index( + in, dim, out, [&](const auto begin, const auto end) { for (const auto out_ix : c10::irange(begin, end)) { std::tuple acc = reduce_over_dim( [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { diff --git a/kernels/portable/cpu/util/reduce_util.h b/kernels/portable/cpu/util/reduce_util.h index 35cfdfbaa72..45db6cb92d3 100644 --- a/kernels/portable/cpu/util/reduce_util.h +++ b/kernels/portable/cpu/util/reduce_util.h @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -661,6 +662,24 @@ bool check_prod_out_args( optional dtype, Tensor& out); +/** + * parallel_for wrapper for reductions that call reduce_over_dim or + * map_reduce_over_dim for each output element. Automatically + * calculates appropriate grain size. + */ +template +[[nodiscard]] bool parallel_for_each_reduce_over_dim_output_index( + const Tensor& in, + optional dim, + const Tensor& out, + const Func& func) { + const auto reduction_size = + dim.has_value() ? 
in.sizes().at(dim.value()) : in.numel(); + const auto grain_size = std::max( + static_cast(1), + executorch::extension::internal::GRAIN_SIZE / reduction_size); + return executorch::extension::parallel_for(0, out.numel(), grain_size, func); +} #endif } // namespace executor diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index e1c5cadfe84..bf2fe042a93 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -313,6 +313,9 @@ def define_common_targets(): "//executorch/runtime/kernel:kernel_includes{}".format(suffix), "//executorch/runtime/core/exec_aten/util:tensor_util{}".format(suffix), ], + exported_deps = [ + "//executorch/runtime/kernel:thread_parallel_interface", + ], exported_preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_mode else [], visibility = [ "//executorch/extension/llm/custom_ops/...", From e6be3fe5d569cbc14d4ab01631e4b1c181c3b4fb Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 11 Mar 2025 08:57:24 -0700 Subject: [PATCH 25/38] Update [ghstack-poisoned] --- kernels/portable/cpu/op_argmin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernels/portable/cpu/op_argmin.cpp b/kernels/portable/cpu/op_argmin.cpp index 699aee7034f..e2f5cdab163 100644 --- a/kernels/portable/cpu/op_argmin.cpp +++ b/kernels/portable/cpu/op_argmin.cpp @@ -85,7 +85,7 @@ Tensor& argmin_out( out_data[out_ix] = std::get<1>(acc); } }); - ET_KERNEL_CHECK_MSG(ctx, success, Internal, out, "parallel_for failed"); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return out; From 1ef9dd846df35fadc2fff3c1fa282c1d663f39f2 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 11 Mar 2025 08:57:26 -0700 Subject: [PATCH 26/38] Update [ghstack-poisoned] --- kernels/portable/cpu/op_any.cpp | 36 +++++++++++++++------------- kernels/portable/cpu/op_argmax.cpp | 38 +++++++++++++++++------------- kernels/portable/cpu/op_max.cpp | 35 +++++++++++++++------------ 
kernels/portable/cpu/op_min.cpp | 35 +++++++++++++++------------ kernels/portable/cpu/op_prod.cpp | 36 +++++++++++++++------------- 5 files changed, 101 insertions(+), 79 deletions(-) diff --git a/kernels/portable/cpu/op_any.cpp b/kernels/portable/cpu/op_any.cpp index 2cfdf36740b..ea5eafb9ba8 100644 --- a/kernels/portable/cpu/op_any.cpp +++ b/kernels/portable/cpu/op_any.cpp @@ -139,22 +139,26 @@ Tensor& any_out( ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, name, CTYPE_IN, [&] { ET_SWITCH_TWO_TYPES(Bool, Byte, out_type, ctx, name, CTYPE_OUT, [&] { CTYPE_OUT* out_data = out.mutable_data_ptr(); - for (const auto out_ix : c10::irange(out.numel())) { - CTYPE_OUT any = false; - if (in.numel() > 0) { - std::tuple acc = - map_reduce_over_dim( - [](CTYPE_IN v) { return static_cast(v); }, - [](bool outv, long, bool acc, long) { - return std::tuple{acc || outv, 0}; - }, - in, - dim, - out_ix); - any = std::get<0>(acc); - } - out_data[out_ix] = any; - } + const bool success = parallel_for_each_reduce_over_dim_output_index( + in, dim, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + CTYPE_OUT any = false; + if (in.numel() > 0) { + std::tuple acc = + map_reduce_over_dim( + [](CTYPE_IN v) { return static_cast(v); }, + [](bool outv, long, bool acc, long) { + return std::tuple{acc || outv, 0}; + }, + in, + dim, + out_ix); + any = std::get<0>(acc); + } + out_data[out_ix] = any; + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); }); diff --git a/kernels/portable/cpu/op_argmax.cpp b/kernels/portable/cpu/op_argmax.cpp index a272d4405a8..ffbc469c53d 100644 --- a/kernels/portable/cpu/op_argmax.cpp +++ b/kernels/portable/cpu/op_argmax.cpp @@ -47,23 +47,27 @@ Tensor& argmax_out( ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "argmax.out", CTYPE, [&] { long* out_data = out.mutable_data_ptr(); - for (const auto out_ix : c10::irange(out.numel())) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, 
CTYPE acc_val, long acc_ix) { - // the below condition as written is equivalent to - // !isnan(accval) && (isnan(v) || v > acc_val). See - // argument in op_argmin.cpp. - if (!std::isnan(acc_val) && !(v <= acc_val)) { - acc_val = v; - acc_ix = ix; - } - return std::tuple{acc_val, acc_ix}; - }, - in, - dim, - out_ix); - out_data[out_ix] = std::get<1>(acc); - } + const bool success = parallel_for_each_reduce_over_dim_output_index( + in, dim, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + std::tuple acc = reduce_over_dim( + [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + // the below condition as written is equivalent to + // !isnan(accval) && (isnan(v) || v > acc_val). See + // argument in op_argmin.cpp. + if (!std::isnan(acc_val) && !(v <= acc_val)) { + acc_val = v; + acc_ix = ix; + } + return std::tuple{acc_val, acc_ix}; + }, + in, + dim, + out_ix); + out_data[out_ix] = std::get<1>(acc); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return out; diff --git a/kernels/portable/cpu/op_max.cpp b/kernels/portable/cpu/op_max.cpp index f206ee05b99..3f4a1d27c0e 100644 --- a/kernels/portable/cpu/op_max.cpp +++ b/kernels/portable/cpu/op_max.cpp @@ -83,21 +83,26 @@ std::tuple max_out( CTYPE* max_data = max.mutable_data_ptr(); long* max_indices_data = max_indices.mutable_data_ptr(); - for (const auto out_ix : c10::irange(max.numel())) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { - if (!std::isnan(acc_val) && (std::isnan(v) || v > acc_val)) { - acc_val = v; - acc_ix = ix; - } - return std::tuple{acc_val, acc_ix}; - }, - in, - dim, - out_ix); - max_data[out_ix] = std::get<0>(acc); - max_indices_data[out_ix] = std::get<1>(acc); - } + const bool success = parallel_for_each_reduce_over_dim_output_index( + in, dim, max, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + std::tuple acc = 
reduce_over_dim( + [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + if (!std::isnan(acc_val) && + (std::isnan(v) || v > acc_val)) { + acc_val = v; + acc_ix = ix; + } + return std::tuple{acc_val, acc_ix}; + }, + in, + dim, + out_ix); + max_data[out_ix] = std::get<0>(acc); + max_indices_data[out_ix] = std::get<1>(acc); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return {max, max_indices}; diff --git a/kernels/portable/cpu/op_min.cpp b/kernels/portable/cpu/op_min.cpp index 683ef751a9d..8b70bcd40f5 100644 --- a/kernels/portable/cpu/op_min.cpp +++ b/kernels/portable/cpu/op_min.cpp @@ -83,21 +83,26 @@ std::tuple min_out( CTYPE* min_data = min.mutable_data_ptr(); long* min_indices_data = min_indices.mutable_data_ptr(); - for (const auto out_ix : c10::irange(min.numel())) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { - if (!std::isnan(acc_val) && (std::isnan(v) || v < acc_val)) { - acc_val = v; - acc_ix = ix; - } - return std::tuple{acc_val, acc_ix}; - }, - in, - dim, - out_ix); - min_data[out_ix] = std::get<0>(acc); - min_indices_data[out_ix] = std::get<1>(acc); - } + const bool success = parallel_for_each_reduce_over_dim_output_index( + in, dim, min, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + std::tuple acc = reduce_over_dim( + [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + if (!std::isnan(acc_val) && + (std::isnan(v) || v < acc_val)) { + acc_val = v; + acc_ix = ix; + } + return std::tuple{acc_val, acc_ix}; + }, + in, + dim, + out_ix); + min_data[out_ix] = std::get<0>(acc); + min_indices_data[out_ix] = std::get<1>(acc); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return {min, min_indices}; diff --git a/kernels/portable/cpu/op_prod.cpp b/kernels/portable/cpu/op_prod.cpp index 27d18ca2570..54580459d7c 100644 --- a/kernels/portable/cpu/op_prod.cpp +++ b/kernels/portable/cpu/op_prod.cpp 
@@ -77,22 +77,26 @@ Tensor& prod_int_out( ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, name, CTYPE_IN, [&] { ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, name, CTYPE_OUT, [&] { CTYPE_OUT* out_data = out.mutable_data_ptr(); - for (const auto out_ix : c10::irange(out.numel())) { - CTYPE_OUT prod = 1; - if (in.numel() > 0) { - std::tuple acc = - map_reduce_over_dim( - [](CTYPE_IN v) { return static_cast(v); }, - [](CTYPE_OUT outv, long, CTYPE_OUT acc, long) { - return std::tuple{acc * outv, 0}; - }, - in, - dim, - out_ix); - prod = std::get<0>(acc); - } - out_data[out_ix] = prod; - } + const bool success = parallel_for_each_reduce_over_dim_output_index( + in, dim, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + CTYPE_OUT prod = 1; + if (in.numel() > 0) { + std::tuple acc = + map_reduce_over_dim( + [](CTYPE_IN v) { return static_cast(v); }, + [](CTYPE_OUT outv, long, CTYPE_OUT acc, long) { + return std::tuple{acc * outv, 0}; + }, + in, + dim, + out_ix); + prod = std::get<0>(acc); + } + out_data[out_ix] = prod; + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); }); From c66f533f421344fc4c38b99dafa6ca7fce594215 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 11 Mar 2025 09:50:04 -0700 Subject: [PATCH 27/38] Update [ghstack-poisoned] --- kernels/portable/cpu/op_argmin.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernels/portable/cpu/op_argmin.cpp b/kernels/portable/cpu/op_argmin.cpp index e2f5cdab163..87e90de4c04 100644 --- a/kernels/portable/cpu/op_argmin.cpp +++ b/kernels/portable/cpu/op_argmin.cpp @@ -53,8 +53,7 @@ Tensor& argmin_out( // that dimension is contiguous. Is there any particular reason we // shouldn't just always use this strategy since we aren't // otherwise capable of parallelizing reductions? - const auto reduction_size = - dim.has_value() ? 
in.sizes().at(dim.value()) : in.numel(); + const int64_t reduction_size = get_reduced_dim_product(in, dim); const auto grain_size = std::max( static_cast(1), executorch::extension::internal::GRAIN_SIZE / reduction_size); From e6d6ad619871b1d4af86246607502e174a9b375f Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 11 Mar 2025 09:50:04 -0700 Subject: [PATCH 28/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/reduce_util.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/kernels/portable/cpu/util/reduce_util.cpp b/kernels/portable/cpu/util/reduce_util.cpp index 09ba508a31d..259269988ed 100644 --- a/kernels/portable/cpu/util/reduce_util.cpp +++ b/kernels/portable/cpu/util/reduce_util.cpp @@ -85,10 +85,7 @@ size_t get_reduced_dim_product( } size_t dim_product = 1; if (!dim.has_value()) { - for (size_t i = 0; i < static_cast(in.dim()); ++i) { - dim_product *= in.size(i); - } - return dim_product; + return in.numel(); } const size_t d = _normalize_non_neg_d(dim.value(), in.dim()); return in.size(d); @@ -107,10 +104,7 @@ size_t get_reduced_dim_product( size_t dim_product = 1; const size_t in_dim = in.dim(); if (!dim_list.has_value() || dim_list.value().size() == 0) { - for (size_t i = 0; i < static_cast(in.dim()); ++i) { - dim_product *= in.size(i); - } - return dim_product; + return in.numel(); } for (const auto& d : dim_list.value()) { const size_t non_neg_d = _normalize_non_neg_d(d, in_dim); From 5dc8b27a9f276f0e27c40095d5112ebb0fca5aff Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 11 Mar 2025 09:50:11 -0700 Subject: [PATCH 29/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/reduce_util.h | 26 ++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/kernels/portable/cpu/util/reduce_util.h b/kernels/portable/cpu/util/reduce_util.h index 0b4da15dda9..b618ba09563 100644 --- a/kernels/portable/cpu/util/reduce_util.h +++ b/kernels/portable/cpu/util/reduce_util.h @@ -675,10 
+675,30 @@ template optional dim, const Tensor& out, const Func& func) { - const int64_t reduction_size = get_reduced_dim_product(in, dim); + const ssize_t reduction_size = get_reduced_dim_product(in, dim); const auto grain_size = std::max( - static_cast(1), - executorch::extension::internal::GRAIN_SIZE / reduction_size); + static_cast(1), + static_cast(executorch::extension::internal::GRAIN_SIZE) / + reduction_size); + return executorch::extension::parallel_for(0, out.numel(), grain_size, func); +} + +/** + * parallel_for wrapper for reductions that call reduce_over_dim_list or + * map_reduce_over_dim_list for each output element. Automatically + * calculates appropriate grain size. + */ +template +[[nodiscard]] bool parallel_for_each_reduce_over_dim_list_output_index( + const Tensor& in, + optional> dim_list, + const Tensor& out, + const Func& func) { + const ssize_t reduction_size = get_reduced_dim_product(in, dim_list); + const auto grain_size = std::max( + static_cast(1), + static_cast(executorch::extension::internal::GRAIN_SIZE) / + reduction_size); return executorch::extension::parallel_for(0, out.numel(), grain_size, func); } From 2dcb6dbb7bd923d9d542ad99ef43610cd944120a Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 11 Mar 2025 09:50:16 -0700 Subject: [PATCH 30/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/reduce_util.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernels/portable/cpu/util/reduce_util.h b/kernels/portable/cpu/util/reduce_util.h index b618ba09563..b7c7efa50fe 100644 --- a/kernels/portable/cpu/util/reduce_util.h +++ b/kernels/portable/cpu/util/reduce_util.h @@ -675,11 +675,15 @@ template optional dim, const Tensor& out, const Func& func) { +#ifdef ET_USE_THREADPOOL const ssize_t reduction_size = get_reduced_dim_product(in, dim); const auto grain_size = std::max( static_cast(1), static_cast(executorch::extension::internal::GRAIN_SIZE) / reduction_size); +#else // ET_USE_THREADPOOL + const auto grain_size = 
1;
+#endif // ET_USE_THREADPOOL
 return executorch::extension::parallel_for(0, out.numel(), grain_size, func); } @@ -694,11 +698,15 @@ template optional> dim_list, const Tensor& out, const Func& func) { +#ifdef ET_USE_THREADPOOL const ssize_t reduction_size = get_reduced_dim_product(in, dim_list); const auto grain_size = std::max( static_cast(1), static_cast(executorch::extension::internal::GRAIN_SIZE) / reduction_size); +#else // ET_USE_THREADPOOL + const auto grain_size = 1; +#endif // ET_USE_THREADPOOL return executorch::extension::parallel_for(0, out.numel(), grain_size, func); } From 01f27904f4bb75cb94b115f5bc4191ba196cb926 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 12 Mar 2025 12:19:34 -0700 Subject: [PATCH 31/38] Update [ghstack-poisoned] --- kernels/portable/cpu/op_amax.cpp | 18 ++++++++------ kernels/portable/cpu/op_amin.cpp | 18 ++++++++------ kernels/portable/cpu/op_any.cpp | 25 +++++++++++-------- kernels/portable/cpu/op_mean.cpp | 35 +++++++++++++++------------ kernels/portable/cpu/op_sum.cpp | 36 +++++++++++++++------------- kernels/portable/cpu/op_var.cpp | 41 ++++++++++++++++++-------------- 6 files changed, 100 insertions(+), 73 deletions(-) diff --git a/kernels/portable/cpu/op_amax.cpp b/kernels/portable/cpu/op_amax.cpp index 6030221d883..4ad409d4820 100644 --- a/kernels/portable/cpu/op_amax.cpp +++ b/kernels/portable/cpu/op_amax.cpp @@ -46,13 +46,17 @@ Tensor& amax_out( ReduceOverDimListPlan plan(in, dim_list); ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "amax.out", CTYPE, [&]() { CTYPE* out_data = out.mutable_data_ptr(); - for (const auto out_ix : c10::irange(out.numel())) { - out_data[out_ix] = plan.execute( - [](CTYPE v, CTYPE max_v) { - return std::isnan(v) || v > max_v ? 
v : max_v; - }, - out_ix); - } + const bool success = parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + out_data[out_ix] = plan.execute( + [](CTYPE v, CTYPE max_v) { + return std::isnan(v) || v > max_v ? v : max_v; + }, + out_ix); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return out; diff --git a/kernels/portable/cpu/op_amin.cpp b/kernels/portable/cpu/op_amin.cpp index e4979390a5d..396cb6c016d 100644 --- a/kernels/portable/cpu/op_amin.cpp +++ b/kernels/portable/cpu/op_amin.cpp @@ -45,13 +45,17 @@ Tensor& amin_out( ReduceOverDimListPlan plan(in, dim_list); ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "amin.out", CTYPE, [&]() { CTYPE* out_data = out.mutable_data_ptr(); - for (const auto out_ix : c10::irange(out.numel())) { - out_data[out_ix] = plan.execute( - [](CTYPE v, CTYPE min_v) { - return std::isnan(v) || v < min_v ? v : min_v; - }, - out_ix); - } + const bool success = parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + out_data[out_ix] = plan.execute( + [](CTYPE v, CTYPE min_v) { + return std::isnan(v) || v < min_v ? 
v : min_v; + }, + out_ix); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return out; diff --git a/kernels/portable/cpu/op_any.cpp b/kernels/portable/cpu/op_any.cpp index a368226db80..ee9e54fc0c3 100644 --- a/kernels/portable/cpu/op_any.cpp +++ b/kernels/portable/cpu/op_any.cpp @@ -96,16 +96,21 @@ Tensor& any_dims_out( static_cast(static_cast(in_data[out_ix])); } } else { - for (const auto out_ix : c10::irange(out.numel())) { - bool any = false; - if (in_not_empty) { - any = plan->execute( - [](CTYPE_IN v) { return static_cast(v); }, - [](bool outv, bool acc) { return acc || outv; }, - out_ix); - } - out_data[out_ix] = static_cast(any); - } + const bool success = + parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + bool any = false; + if (in_not_empty) { + any = plan->execute( + [](CTYPE_IN v) { return static_cast(v); }, + [](bool outv, bool acc) { return acc || outv; }, + out_ix); + } + out_data[out_ix] = static_cast(any); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); } }); }); diff --git a/kernels/portable/cpu/op_mean.cpp b/kernels/portable/cpu/op_mean.cpp index c13e2a09937..423c2564232 100644 --- a/kernels/portable/cpu/op_mean.cpp +++ b/kernels/portable/cpu/op_mean.cpp @@ -46,22 +46,27 @@ Tensor& mean_dim_out( out); MapReduceOverDimListPlan plan(in, dim_list); - ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "mean.out", CTYPE_IN, [&] { - ET_SWITCH_FLOATHBF16_TYPES( - out.scalar_type(), ctx, "mean.out", CTYPE_OUT, [&] { - CTYPE_OUT* out_data = out.mutable_data_ptr(); - const size_t num = get_reduced_dim_product(in, dim_list); - for (const auto out_ix : c10::irange(out.numel())) { - CTYPE_OUT sum = 0; - if (in.numel() > 0) { - sum = plan.execute( - [](CTYPE_IN v) { return static_cast(v); }, - [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, - out_ix); + // 
@lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "mean.out"; + ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] { + ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] { + CTYPE_OUT* out_data = out.mutable_data_ptr(); + const size_t num = get_reduced_dim_product(in, dim_list); + const bool success = parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + CTYPE_OUT sum = 0; + if (in.numel() > 0) { + sum = plan.execute( + [](CTYPE_IN v) { return static_cast(v); }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + out_ix); + } + out_data[out_ix] = sum / static_cast(num); } - out_data[out_ix] = sum / static_cast(num); - } - }); + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); + }); }); return out; diff --git a/kernels/portable/cpu/op_sum.cpp b/kernels/portable/cpu/op_sum.cpp index f58773a6769..550f6b9572f 100644 --- a/kernels/portable/cpu/op_sum.cpp +++ b/kernels/portable/cpu/op_sum.cpp @@ -50,23 +50,27 @@ Tensor& sum_dim_out( if (in.numel() > 0) { plan.emplace(in, dim_list); } - ET_SWITCH_REALHBBF16_TYPES( - in.scalar_type(), ctx, "sum.IntList_out", CTYPE_IN, [&] { - ET_SWITCH_REALHBBF16_TYPES( - out.scalar_type(), ctx, "sum.IntList_out", CTYPE_OUT, [&] { - CTYPE_OUT* out_data = out.mutable_data_ptr(); - for (const auto out_ix : c10::irange(out.numel())) { - CTYPE_OUT sum = 0; - if (plan.has_value()) { - sum = plan->execute( - [](CTYPE_IN v) { return static_cast(v); }, - [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, - out_ix); - } - out_data[out_ix] = sum; + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "sum.IntList_out"; + ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] { + ET_SWITCH_REALHBBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] { 
CTYPE_OUT* out_data = out.mutable_data_ptr(); + const bool success = parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + CTYPE_OUT sum = 0; + if (plan.has_value()) { + sum = plan->execute( + [](CTYPE_IN v) { return static_cast(v); }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + out_ix); } - }); - }); + out_data[out_ix] = sum; + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); + }); + }); return out; } diff --git a/kernels/portable/cpu/op_var.cpp b/kernels/portable/cpu/op_var.cpp index c5be3fdad62..f09f1d92bc9 100644 --- a/kernels/portable/cpu/op_var.cpp +++ b/kernels/portable/cpu/op_var.cpp @@ -21,6 +21,7 @@ namespace { template void compute_variance( + KernelRuntimeContext& ctx, const Tensor& in, Tensor& out, optional> dim_list, @@ -33,22 +34,26 @@ void compute_variance( } } else { MapReduceOverDimListPlan plan(in, dim_list); - for (const auto out_ix : c10::irange(out.numel())) { - CTYPE_OUT sum = plan.execute( - [](CTYPE_IN v) { return static_cast(v); }, - [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, - out_ix); - CTYPE_OUT mean = sum / static_cast(num); - CTYPE_OUT sum2 = plan.execute( - [mean](CTYPE_IN v) { - return ( - (static_cast(v) - mean) * - (static_cast(v) - mean)); - }, - [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, - out_ix); - out_data[out_ix] = sum2 / denominator; - } + const bool success = parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + CTYPE_OUT sum = plan.execute( + [](CTYPE_IN v) { return static_cast(v); }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + out_ix); + CTYPE_OUT mean = sum / static_cast(num); + CTYPE_OUT sum2 = plan.execute( + [mean](CTYPE_IN v) { + return ( + (static_cast(v) - mean) * + (static_cast(v) - mean)); 
+ }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + out_ix); + out_data[out_ix] = sum2 / denominator; + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); } } @@ -90,7 +95,7 @@ Tensor& var_out( ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] { ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] { - compute_variance(in, out, dim_list, num, denom); + compute_variance(ctx, in, out, dim_list, num, denom); }); }); @@ -135,7 +140,7 @@ Tensor& var_correction_out( ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] { ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] { - compute_variance(in, out, dim_list, num, denom); + compute_variance(ctx, in, out, dim_list, num, denom); }); }); From ccfbacf7671083bd79538920e525a222b5c6d9bb Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 13 Mar 2025 14:13:28 -0700 Subject: [PATCH 32/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/elementwise_util.h | 61 +++++++++++++------- kernels/portable/cpu/util/targets.bzl | 1 + 2 files changed, 41 insertions(+), 21 deletions(-) diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index a5bcd6ff98b..02d9a909c17 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -100,28 +101,46 @@ inline void apply_elementwise_fn( const auto out_element_size = out.element_size(); if (any_is_broadcasted) { - for (const auto& indexes : - BroadcastIndexesRange(out, (*inputs.first)...)) { - std::array loaded_inputs; - for (const auto idx : c10::irange(kNumInputs)) { - const auto& input_info = inputs_info[idx]; - loaded_inputs[idx] = input_info.load_to_common( - &input_info.data_ptr[indexes[idx + 1] * input_info.element_size]); - } - auto result = std::apply(compute_fun, loaded_inputs); - 
store_common_to_out(result, &data_out[indexes[0] * out_element_size]); - } + ::executorch::extension::parallel_for( + 0, + out.numel(), + ::executorch::extension::internal::GRAIN_SIZE, + [&](const auto begin, const auto end) { + const auto range = + BroadcastIndexesRange(out, (*inputs.first)...); + auto begin_it = range.begin(); + begin_it += begin; + for (; (*begin_it)[0] < end; ++begin_it) { + const auto& indexes = *begin_it; + std::array loaded_inputs; + for (const auto idx : c10::irange(kNumInputs)) { + const auto& input_info = inputs_info[idx]; + loaded_inputs[idx] = input_info.load_to_common( + &input_info + .data_ptr[indexes[idx + 1] * input_info.element_size]); + } + auto result = std::apply(compute_fun, loaded_inputs); + store_common_to_out( + result, &data_out[indexes[0] * out_element_size]); + } + }); } else { - for (const auto i : c10::irange(out.numel())) { - std::array loaded_inputs; - for (const auto idx : c10::irange(kNumInputs)) { - const auto& input_info = inputs_info[idx]; - loaded_inputs[idx] = input_info.load_to_common( - &input_info.data_ptr[i * input_info.element_size]); - } - auto result = std::apply(compute_fun, loaded_inputs); - store_common_to_out(result, &data_out[i * out_element_size]); - } + ::executorch::extension::parallel_for( + 0, + out.numel(), + ::executorch::extension::internal::GRAIN_SIZE, + [&](const auto begin, const auto end) { + for (const auto i : c10::irange(begin, end)) { + std::array loaded_inputs; + for (const auto idx : c10::irange(kNumInputs)) { + const auto& input_info = inputs_info[idx]; + loaded_inputs[idx] = input_info.load_to_common( + &input_info.data_ptr[i * input_info.element_size]); + } + auto result = std::apply(compute_fun, loaded_inputs); + store_common_to_out(result, &data_out[i * out_element_size]); + } + }); } } } // namespace internal diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index bf2fe042a93..bffea2140b5 100644 --- 
a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -110,6 +110,7 @@ def define_common_targets(): ":broadcast_util", ":dtype_util", "//executorch/runtime/kernel:kernel_runtime_context", + "//executorch/runtime/kernel:thread_parallel_interface", ], deps = [ "//executorch/kernels/portable/cpu:scalar_utils", From 21958dc7010b9d07fcbca9a1f18a7ca680fd6e89 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 14 Mar 2025 15:49:28 -0700 Subject: [PATCH 33/38] Update [ghstack-poisoned] --- kernels/optimized/cpu/op_where.cpp | 46 ++++++------------- .../cpu/util/broadcast_indexes_range.h | 26 +++++++---- kernels/portable/cpu/util/broadcast_util.h | 39 +++------------- kernels/portable/cpu/util/elementwise_util.h | 36 ++++----------- 4 files changed, 47 insertions(+), 100 deletions(-) diff --git a/kernels/optimized/cpu/op_where.cpp b/kernels/optimized/cpu/op_where.cpp index 7d58ba4852c..fb14e542891 100644 --- a/kernels/optimized/cpu/op_where.cpp +++ b/kernels/optimized/cpu/op_where.cpp @@ -48,42 +48,24 @@ Tensor& opt_where_out( cond.scalar_type() == ScalarType::Bool) { auto out_numel = out.numel(); ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool cond_is_broadcasted = !out.sizes().equals(cond.sizes()); - const bool any_is_broadcasted = - (a_is_broadcasted || b_is_broadcasted || cond_is_broadcasted); const CTYPE_COMPUTE* const data_a = a.const_data_ptr(); const CTYPE_COMPUTE* const data_b = b.const_data_ptr(); const bool* const data_cond = cond.const_data_ptr(); CTYPE_COMPUTE* const data_out = out.data_ptr(); - if (any_is_broadcasted) { - executorch::extension::parallel_for( - 0, - out_numel, - ::executorch::extension::internal::GRAIN_SIZE, - [&](const auto begin, const auto end) { - auto range = BroadcastIndexesRange<3>(out, a, b, cond); - auto begin_it = range.begin(); - 
begin_it += begin; - for (; (*begin_it)[0] < end; ++begin_it) { - const auto [out_index, a_index, b_index, cond_index] = - *begin_it; - data_out[out_index] = - data_cond[cond_index] ? data_a[a_index] : data_b[b_index]; - } - }); - } else { - executorch::extension::parallel_for( - 0, - out_numel, - ::executorch::extension::internal::GRAIN_SIZE, - [&](const auto begin, const auto end) { - for (const auto i : c10::irange(begin, end)) { - data_out[i] = data_cond[i] ? data_a[i] : data_b[i]; - } - }); - } + executorch::extension::parallel_for( + 0, + out_numel, + ::executorch::extension::internal::GRAIN_SIZE, + [&](const auto begin, const auto end) { + auto range = BroadcastIndexesRange<3>(out, a, b, cond); + auto begin_it = range.begin(); + begin_it += begin; + for (; (*begin_it)[0] < end; ++begin_it) { + const auto [out_index, a_index, b_index, cond_index] = *begin_it; + data_out[out_index] = + data_cond[cond_index] ? data_a[a_index] : data_b[b_index]; + } + }); }); } else { // Fall back for mixed dtype to keep code size and compile time diff --git a/kernels/portable/cpu/util/broadcast_indexes_range.h b/kernels/portable/cpu/util/broadcast_indexes_range.h index 5fa50d8d212..7b78f4c2814 100644 --- a/kernels/portable/cpu/util/broadcast_indexes_range.h +++ b/kernels/portable/cpu/util/broadcast_indexes_range.h @@ -34,14 +34,17 @@ class BroadcastIndexesIterator { template explicit BroadcastIndexesIterator(const Tensor& output, const Args&... args) - : output_dim_(output.dim()), - output_shape_(output.sizes()), - effective_input_broadcast_strides_{ - effective_input_broadcast_stride(output, args)...} { + : output_dim_or_zero_if_no_broadcasting_( + ((args.sizes() == output.sizes()) && ...) ? 
0 : output.dim()), + output_shape_(output.sizes()) { static_assert( sizeof...(args) == kNumInputs && (std::is_same_v && ...), "BroadcastIndexesIterator constructor requires kNumInputs input tensor" "arguments!"); + if (output_dim_or_zero_if_no_broadcasting_ != 0) { + effective_input_broadcast_strides_ = { + effective_input_broadcast_stride(output, args)...}; + } } struct make_end_t { @@ -73,9 +76,14 @@ class BroadcastIndexesIterator { BroadcastIndexesIterator& operator++() { output_index()++; + if (output_dim_or_zero_if_no_broadcasting_ == 0) { + std::fill( + current_indexes_.begin() + 1, current_indexes_.end(), output_index()); + return *this; + } // TODO: add optimization for particular input tensors not being // broadcasted? - for (auto ii = output_dim_ - 1; ii >= 0; --ii) { + for (auto ii = output_dim_or_zero_if_no_broadcasting_ - 1; ii >= 0; --ii) { // You might wonder what happens if output_shape_[ii] == 0. In // that case, output.numel() would be 0, and thus we would have // begin() == end() and no iteration. @@ -121,7 +129,8 @@ class BroadcastIndexesIterator { delinearized_output_index_.size()); for (const auto ii : c10::irange(1, kNumInputs + 1)) { current_indexes_[ii] = 0; - for (const auto jj : c10::irange(output_dim_)) { + for (const auto jj : + c10::irange(output_dim_or_zero_if_no_broadcasting_)) { current_indexes_[ii] += delinearized_output_index_[jj] * effective_input_broadcast_strides_[ii - 1][jj]; } @@ -180,7 +189,7 @@ class BroadcastIndexesIterator { // followed by kNumInputs input indexes. std::array current_indexes_ = {0}; ShapeType delinearized_output_index_ = {0}; - ssize_t output_dim_; + ssize_t output_dim_or_zero_if_no_broadcasting_; ArrayRef output_shape_; // The linear index for a broadcast tensor is // sum(delinearized_output_index_[i] * input_stride_[i] if @@ -189,8 +198,7 @@ class BroadcastIndexesIterator { // output_dim. 
This is straightforwardly implementable with an // adjusted stride array that contains 0s where the padded input // shape would contain 1s. - std::array effective_input_broadcast_strides_ = { - {{0}}}; + std::array effective_input_broadcast_strides_; }; } // namespace internal diff --git a/kernels/portable/cpu/util/broadcast_util.h b/kernels/portable/cpu/util/broadcast_util.h index ed536f86c2d..2b10ee24411 100644 --- a/kernels/portable/cpu/util/broadcast_util.h +++ b/kernels/portable/cpu/util/broadcast_util.h @@ -254,26 +254,13 @@ inline void apply_binary_elementwise_fn( const Tensor& a, const Tensor& b, const Tensor& out) { - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); - const CTYPE_A* const data_a = a.const_data_ptr(); const CTYPE_B* const data_b = b.const_data_ptr(); CTYPE_OUT* const data_out = out.mutable_data_ptr(); - if (any_is_broadcasted) { - for (const auto [out_index, a_index, b_index] : - BroadcastIndexesRange<2>(out, a, b)) { - data_out[out_index] = compute_fun(data_a[a_index], data_b[b_index]); - } - } else { - for (const auto i : c10::irange(out.numel())) { - size_t a_linear_index = i; - size_t b_linear_index = i; - - data_out[i] = compute_fun(data_a[a_linear_index], data_b[b_linear_index]); - } + for (const auto [out_index, a_index, b_index] : + BroadcastIndexesRange<2>(out, a, b)) { + data_out[out_index] = compute_fun(data_a[a_index], data_b[b_index]); } } @@ -294,27 +281,15 @@ inline void apply_ternary_elementwise_fn( const Tensor& b, const Tensor& c, const Tensor& out) { - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool c_is_broadcasted = !out.sizes().equals(c.sizes()); - const bool any_is_broadcasted = - (a_is_broadcasted || b_is_broadcasted || c_is_broadcasted); - const CTYPE_A* const data_a = 
a.const_data_ptr(); const CTYPE_B* const data_b = b.const_data_ptr(); const CTYPE_C* const data_c = c.const_data_ptr(); CTYPE_OUT* const data_out = out.mutable_data_ptr(); - if (any_is_broadcasted) { - for (const auto [out_index, a_index, b_index, c_index] : - BroadcastIndexesRange<3>(out, a, b, c)) { - data_out[out_index] = - compute_fun(data_a[a_index], data_b[b_index], data_c[c_index]); - } - } else { - for (const auto i : c10::irange(out.numel())) { - data_out[i] = compute_fun(data_a[i], data_b[i], data_c[i]); - } + for (const auto [out_index, a_index, b_index, c_index] : + BroadcastIndexesRange<3>(out, a, b, c)) { + data_out[out_index] = + compute_fun(data_a[a_index], data_b[b_index], data_c[c_index]); } } diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index a5bcd6ff98b..23ec481bb7f 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -76,11 +76,6 @@ inline void apply_elementwise_fn( internal::check_tensor_dtype(out, out_dtypes, compute_type), InvalidArgument, ); - bool any_is_broadcasted = false; - if constexpr (kNumInputs > 1) { - any_is_broadcasted = (!out.sizes().equals(inputs.first->sizes()) || ...); - } - struct InputInfo { load_to_common_fn load_to_common; const char* data_ptr; @@ -99,29 +94,16 @@ inline void apply_elementwise_fn( char* const data_out = reinterpret_cast(out.mutable_data_ptr()); const auto out_element_size = out.element_size(); - if (any_is_broadcasted) { - for (const auto& indexes : - BroadcastIndexesRange(out, (*inputs.first)...)) { - std::array loaded_inputs; - for (const auto idx : c10::irange(kNumInputs)) { - const auto& input_info = inputs_info[idx]; - loaded_inputs[idx] = input_info.load_to_common( - &input_info.data_ptr[indexes[idx + 1] * input_info.element_size]); - } - auto result = std::apply(compute_fun, loaded_inputs); - store_common_to_out(result, &data_out[indexes[0] * out_element_size]); - } - } else { - 
for (const auto i : c10::irange(out.numel())) { - std::array loaded_inputs; - for (const auto idx : c10::irange(kNumInputs)) { - const auto& input_info = inputs_info[idx]; - loaded_inputs[idx] = input_info.load_to_common( - &input_info.data_ptr[i * input_info.element_size]); - } - auto result = std::apply(compute_fun, loaded_inputs); - store_common_to_out(result, &data_out[i * out_element_size]); + for (const auto& indexes : + BroadcastIndexesRange(out, (*inputs.first)...)) { + std::array loaded_inputs; + for (const auto idx : c10::irange(kNumInputs)) { + const auto& input_info = inputs_info[idx]; + loaded_inputs[idx] = input_info.load_to_common( + &input_info.data_ptr[indexes[idx + 1] * input_info.element_size]); } + auto result = std::apply(compute_fun, loaded_inputs); + store_common_to_out(result, &data_out[indexes[0] * out_element_size]); } } } // namespace internal From 5cb625abcaf55c2145021cd098d41b45bb1ca184 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 17 Mar 2025 16:03:04 -0700 Subject: [PATCH 34/38] Update [ghstack-poisoned] --- .../executor_runner/executor_runner.cpp | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp index 1e0241958b9..ad8c159a7be 100644 --- a/examples/portable/executor_runner/executor_runner.cpp +++ b/examples/portable/executor_runner/executor_runner.cpp @@ -249,18 +249,24 @@ int main(int argc, char** argv) { (uint32_t)method.error()); ET_LOG(Info, "Method loaded."); - // Allocate input tensors and set all of their elements to 1. The `inputs` - // variable owns the allocated memory and must live past the last call to - // `execute()`. - auto inputs = executorch::extension::prepare_input_tensors(*method); - ET_CHECK_MSG( - inputs.ok(), - "Could not prepare inputs: 0x%" PRIx32, - (uint32_t)inputs.error()); - ET_LOG(Info, "Inputs prepared."); - // Run the model. 
for (uint32_t i = 0; i < FLAGS_num_executions; i++) { + ET_LOG(Info, "Preparing inputs."); + // Allocate input tensors and set all of their elements to 1. The `inputs` + // variable owns the allocated memory and must live past the last call to + // `execute()`. + // + // NOTE: we have to re-prepare input tensors on every execution + // because inputs whose space gets reused by memory planning (if + // any such inputs exist) will not be preserved for the next + // execution. + auto inputs = executorch::extension::prepare_input_tensors(*method); + ET_CHECK_MSG( + inputs.ok(), + "Could not prepare inputs: 0x%" PRIx32, + (uint32_t)inputs.error()); + ET_LOG(Info, "Inputs prepared."); + Error status = method->execute(); ET_CHECK_MSG( status == Error::Ok, From ac789abf1d728bbc39690642a51904aa169b1269 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 17 Mar 2025 16:17:13 -0700 Subject: [PATCH 35/38] Update [ghstack-poisoned] --- examples/portable/executor_runner/executor_runner.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp index ad8c159a7be..08907d333c4 100644 --- a/examples/portable/executor_runner/executor_runner.cpp +++ b/examples/portable/executor_runner/executor_runner.cpp @@ -251,7 +251,7 @@ int main(int argc, char** argv) { // Run the model. for (uint32_t i = 0; i < FLAGS_num_executions; i++) { - ET_LOG(Info, "Preparing inputs."); + ET_LOG(Debug, "Preparing inputs."); // Allocate input tensors and set all of their elements to 1. The `inputs` // variable owns the allocated memory and must live past the last call to // `execute()`. 
@@ -265,7 +265,7 @@ int main(int argc, char** argv) { inputs.ok(), "Could not prepare inputs: 0x%" PRIx32, (uint32_t)inputs.error()); - ET_LOG(Info, "Inputs prepared."); + ET_LOG(Debug, "Inputs prepared."); Error status = method->execute(); ET_CHECK_MSG( From a9a9a1e2e096246d193fe684ab05664b4a5713b0 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 17 Mar 2025 16:17:13 -0700 Subject: [PATCH 36/38] Update [ghstack-poisoned] --- .../portable/executor_runner/executor_runner.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp index 08907d333c4..7c75c39f0a9 100644 --- a/examples/portable/executor_runner/executor_runner.cpp +++ b/examples/portable/executor_runner/executor_runner.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #ifdef ET_EVENT_TRACER_ENABLED #include @@ -249,6 +250,7 @@ int main(int argc, char** argv) { (uint32_t)method.error()); ET_LOG(Info, "Method loaded."); + et_timestamp_t time_spent_executing = 0; // Run the model. 
for (uint32_t i = 0; i < FLAGS_num_executions; i++) { ET_LOG(Debug, "Preparing inputs."); @@ -267,17 +269,24 @@ int main(int argc, char** argv) { (uint32_t)inputs.error()); ET_LOG(Debug, "Inputs prepared."); + const et_timestamp_t before_execute = et_pal_current_ticks(); Error status = method->execute(); + const et_timestamp_t after_execute = et_pal_current_ticks(); + time_spent_executing += after_execute - before_execute; ET_CHECK_MSG( status == Error::Ok, "Execution of method %s failed with status 0x%" PRIx32, method_name, (uint32_t)status); } + const auto tick_ratio = et_pal_ticks_to_ns_multiplier(); + constexpr auto NANOSECONDS_PER_MILLISECOND = 1000000; ET_LOG( Info, - "Model executed successfully %" PRIu32 " time(s).", - FLAGS_num_executions); + "Model executed successfully %" PRIu32 " time(s) in %f ms.", + FLAGS_num_executions, + static_cast(time_spent_executing) * tick_ratio.numerator / + tick_ratio.denominator / NANOSECONDS_PER_MILLISECOND); // Print the outputs. std::vector outputs(method->outputs_size()); From 7369bc2f777b615413abb08b00eab806bfe16993 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 17 Mar 2025 18:55:52 -0700 Subject: [PATCH 37/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/functional_util.h | 13 ++++++++++--- kernels/portable/cpu/util/targets.bzl | 3 +++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/kernels/portable/cpu/util/functional_util.h b/kernels/portable/cpu/util/functional_util.h index 609a1a26fa5..d7ea201dbd2 100644 --- a/kernels/portable/cpu/util/functional_util.h +++ b/kernels/portable/cpu/util/functional_util.h @@ -12,6 +12,7 @@ #include #include +#include namespace torch { namespace executor { @@ -53,9 +54,15 @@ inline void apply_unary_map_fn( CTYPE_OUT* const data_out, const int64_t size, const int64_t stride = 1) { - for (const auto i : c10::irange(size)) { - data_out[i * stride] = map_fun(data_in[i * stride]); - } + executorch::extension::parallel_for( + 0, + size, + 
::executorch::extension::internal::GRAIN_SIZE, + [&](const auto begin, const auto end) { + for (const auto i : c10::irange(begin, end)) { + data_out[i * stride] = map_fun(data_in[i * stride]); + } + }); } // diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index d3274bb3c96..a623b9d4d7a 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -244,6 +244,9 @@ def define_common_targets(): name = "functional_util", srcs = [], exported_headers = ["functional_util.h"], + exported_deps = [ + "//executorch/runtime/kernel:thread_parallel_interface", + ], deps = [ "//executorch/runtime/kernel:kernel_includes", "//executorch/runtime/core/exec_aten/util:tensor_util", From a865349295c476578741b954209bff0899b85ecd Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 18 Mar 2025 13:49:12 -0700 Subject: [PATCH 38/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/broadcast_indexes_range.h | 5 +++++ .../portable/cpu/util/test/broadcast_indexes_range_test.cpp | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/kernels/portable/cpu/util/broadcast_indexes_range.h b/kernels/portable/cpu/util/broadcast_indexes_range.h index 7b78f4c2814..aaf7207d0c9 100644 --- a/kernels/portable/cpu/util/broadcast_indexes_range.h +++ b/kernels/portable/cpu/util/broadcast_indexes_range.h @@ -122,6 +122,11 @@ class BroadcastIndexesIterator { } output_index() += n; + if (output_dim_or_zero_if_no_broadcasting_ == 0) { + std::fill( + current_indexes_.begin() + 1, current_indexes_.end(), output_index()); + return *this; + } delinearize_index( output_index(), output_shape_, diff --git a/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp b/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp index 519cd9fe9f9..1023915ea66 100644 --- a/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp +++ b/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp @@ 
-44,7 +44,9 @@ TEST(BroadcastIndexesRangeTest, OneDNotBroadcasted) { Tensor out = tf.zeros({5}); int idx = 0; - for (const auto& elem : range_to_vec(BroadcastIndexesRange<1>(out, out))) { + const auto range = BroadcastIndexesRange<1>(out, out); + for (const auto& elem : range_to_vec(range)) { + EXPECT_EQ(*(range.begin() + idx), elem); EXPECT_EQ(elem[0], idx++); EXPECT_EQ(elem[0], elem[1]); } @@ -71,7 +73,7 @@ TEST(BroadcastIndexesRangeTest, ScalarBroadcastToOneD) { template void test_operator_plus(const Range& range) { size_t idx = 0; - for (const auto indexes : range) { + for (const auto& indexes : range) { EXPECT_EQ(*(range.begin() + idx), indexes); idx++; }