From d0b11e8bb47dacb6e200ac8b0e2609626ccee0f3 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 11:35:29 -0800 Subject: [PATCH 01/14] Update [ghstack-poisoned] --- CMakeLists.txt | 2 +- build/cmake_deps.toml | 18 ++++++++++++++++++ kernels/optimized/CMakeLists.txt | 2 +- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index de941663a88..73b89b6171e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -749,9 +749,9 @@ endif() if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO - AND CMAKE_CXX_STANDARD GREATER_EQUAL 14 ) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/parallel) endif() if(EXECUTORCH_BUILD_PYBIND) diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index 4b22a09cb5b..4bbfd636a96 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -73,6 +73,7 @@ excludes = [ deps = [ "executorch", "executorch_core", + "extension_parallel", "extension_threadpool", "portable_kernels", ] @@ -115,6 +116,7 @@ excludes = [ deps = [ "executorch_core", "executorch", + "extension_parallel", ] [targets.optimized_native_cpu_ops] @@ -129,6 +131,8 @@ excludes = [ deps = [ "executorch_core", "executorch", + "extension_parallel", + "extension_threadpool", "portable_kernels", ] # ---------------------------------- core end ---------------------------------- @@ -208,6 +212,19 @@ deps = [ "extension_runner_util", ] +[targets.extension_parallel] +buck_targets = [ + "//extension/parallel:thread_parallel", +] +filters = [ + ".cpp$", +] +deps = [ + "executorch", + "executorch_core", + "extension_threadpool", +] + [targets.extension_tensor] buck_targets = [ "//extension/tensor:tensor", @@ -395,6 +412,7 @@ deps = [ "executorch", "executorch_core", "optimized_kernels", + "extension_parallel", "extension_threadpool", "xnnpack_backend", ] diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index 235c6738d9a..d9b19d4f9c2 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -43,7 +43,7 @@ endif() list(TRANSFORM _optimized_cpublas__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(cpublas STATIC ${_optimized_cpublas__srcs}) target_link_libraries( - cpublas PRIVATE executorch_core eigen_blas extension_threadpool + cpublas PRIVATE executorch_core eigen_blas extension_parallel extension_threadpool ) target_compile_options(cpublas PUBLIC ${_common_compile_options}) From 9437be1e7055d5705540d3544955b5d30f72be43 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 12:29:58 -0800 Subject: [PATCH 02/14] Update [ghstack-poisoned] --- extension/parallel/CMakeLists.txt | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 extension/parallel/CMakeLists.txt diff --git a/extension/parallel/CMakeLists.txt b/extension/parallel/CMakeLists.txt new file mode 100644 index 00000000000..7f727aafe46 --- /dev/null +++ b/extension/parallel/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Please keep this file formatted by running: +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ + +if(NOT (EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO)) + message(FATAL_ERROR "extension/parallel requires extension/threadpool") +endif() + +add_library(extension_parallel thread_parallel.cpp) + +target_link_libraries(extension_parallel PUBLIC executorch_core extension_threadpool) +target_compile_options(extension_parallel PUBLIC ${_common_compile_options}) + +install( + TARGETS extension_parallel + DESTINATION lib + INCLUDES + DESTINATION ${_common_include_directories}) From 643e10ee081b1ea34f0f5fb49f7df44a9f2f666b Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 14:54:47 -0800 Subject: [PATCH 03/14] Update [ghstack-poisoned] --- build/executorch-config.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index d14a1227cd9..539c7c2960e 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -67,6 +67,7 @@ set(lib_list portable_ops_lib extension_module extension_module_static + extension_parallel extension_runner_util extension_tensor extension_threadpool @@ -114,3 +115,7 @@ foreach(lib ${lib_list}) list(APPEND EXECUTORCH_LIBRARIES ${lib}) endif() endforeach() + +# TODO: investigate use of install(EXPORT) to cleanly handle +# target_compile_options/target_compile_definitions for everything. +target_link_libraries(cpublas INTERFACE extension_parallel) From 6f2842b876a5a2310c6bf7311d6f6b76bc54549e Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 14:54:48 -0800 Subject: [PATCH 04/14] Update [ghstack-poisoned] --- build/executorch-config.cmake | 1 + examples/models/llama/CMakeLists.txt | 9 --------- examples/models/llava/CMakeLists.txt | 9 --------- 3 files changed, 1 insertion(+), 18 deletions(-) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 539c7c2960e..d238db8ca95 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -65,6 +65,7 @@ set(lib_list neuron_backend qnn_executorch_backend portable_ops_lib + custom_ops extension_module extension_module_static extension_parallel diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index 5f49581ea25..f5d5a78d430 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -84,14 +84,6 @@ if(CMAKE_TOOLCHAIN_IOS OR ANDROID) target_link_options_shared_lib(executorch) endif() -# custom ops library -if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/custom_ops - ${CMAKE_CURRENT_BINARY_DIR}/../../../extension/llm/custom_ops - ) -endif() - # llama_runner library add_subdirectory(runner) @@ -119,7 +111,6 @@ target_link_options_shared_lib(quantized_ops_lib) list(APPEND link_libraries quantized_kernels quantized_ops_lib) if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - target_link_options_shared_lib(custom_ops) list(APPEND link_libraries custom_ops) endif() diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index ecd00809fdb..f7fa4bacc04 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -93,14 +93,6 @@ if(CMAKE_TOOLCHAIN_IOS OR ANDROID) target_link_options_shared_lib(executorch) endif() -# custom ops library -if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - add_subdirectory( - ${EXECUTORCH_ROOT}/extension/llm/custom_ops - ${CMAKE_CURRENT_BINARY_DIR}/../../../extension/llm/custom_ops - ) -endif() - # llava_runner library add_subdirectory(runner) @@ -132,7 +124,6 @@ target_link_options_shared_lib(quantized_ops_lib) list(APPEND link_libraries quantized_kernels quantized_ops_lib) if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - target_link_options_shared_lib(custom_ops) list(APPEND link_libraries custom_ops) endif() From e47dfeb68a2e8e2ae0e3fa2add553f724f73404d Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 14:54:52 -0800 Subject: [PATCH 05/14] Update [ghstack-poisoned] --- build/executorch-config.cmake | 1 + examples/models/llama/CMakeLists.txt | 1 - examples/models/llava/CMakeLists.txt | 1 - examples/models/llava/targets.bzl | 3 --- extension/android/CMakeLists.txt | 1 - extension/llm/custom_ops/CMakeLists.txt | 4 ++-- extension/threadpool/CMakeLists.txt | 1 + 7 files changed, 4 insertions(+), 8 deletions(-) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index d238db8ca95..75e1075d929 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -120,3 +120,4 @@ endforeach() # TODO: investigate use of install(EXPORT) to cleanly handle # target_compile_options/target_compile_definitions for everything. target_link_libraries(cpublas INTERFACE extension_parallel) +target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL) diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index f5d5a78d430..b3364be610a 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -131,7 +131,6 @@ endif() set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack) # Extra compile option and include dir for pthreadpool if(EXECUTORCH_BUILD_PTHREADPOOL) - list(APPEND _common_compile_options -DET_USE_THREADPOOL) list(APPEND link_libraries extension_threadpool pthreadpool) list(APPEND _common_include_directories ${XNNPACK_ROOT}/third-party/pthreadpool/include diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index f7fa4bacc04..5d5857dd5af 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -130,7 +130,6 @@ endif() set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack) # Extra compile option and include dir for pthreadpool if(EXECUTORCH_BUILD_PTHREADPOOL) - list(APPEND _common_compile_options -DET_USE_THREADPOOL) list(APPEND link_libraries extension_threadpool pthreadpool) list(APPEND _common_include_directories ${XNNPACK_ROOT}/third-party/pthreadpool/include diff --git a/examples/models/llava/targets.bzl b/examples/models/llava/targets.bzl index 5efb099f06f..6f3a370acf4 100644 --- a/examples/models/llava/targets.bzl +++ b/examples/models/llava/targets.bzl @@ -7,9 +7,6 @@ def define_common_targets(): "main.cpp", ], compiler_flags = ["-Wno-global-constructors"], - preprocessor_flags = [ - "-DET_USE_THREADPOOL", - ], deps = [ "//executorch/examples/models/llava/runner:runner", "//executorch/extension/evalue_util:print_evalue", diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 70f21f2751c..849d1d14364 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -124,7 +124,6 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM) endif() if(TARGET pthreadpool) - target_compile_definitions(executorch_jni PRIVATE ET_USE_THREADPOOL=1) target_include_directories( executorch_jni PUBLIC diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index c3969e6f9bf..eeb118d4344 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -78,7 +78,7 @@ target_include_directories( target_link_libraries(custom_ops PUBLIC ${custom_ops_libs} executorch_core) target_compile_options( - custom_ops PUBLIC ${_common_compile_options} -DET_USE_THREADPOOL + custom_ops PUBLIC ${_common_compile_options} ) install(TARGETS custom_ops DESTINATION lib) @@ -130,7 +130,7 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) target_compile_options( custom_ops_aot_lib PUBLIC -Wno-deprecated-declarations -fPIC -frtti -fexceptions - ${_common_compile_options} -DET_USE_THREADPOOL + ${_common_compile_options} ) install(TARGETS custom_ops_aot_lib diff --git a/extension/threadpool/CMakeLists.txt b/extension/threadpool/CMakeLists.txt index 90288656674..c1d86acf75d 100644 --- a/extension/threadpool/CMakeLists.txt +++ b/extension/threadpool/CMakeLists.txt @@ -32,6 +32,7 @@ target_include_directories( PUBLIC ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include ) +target_compile_definitions(extension_threadpool PUBLIC ET_USE_THREADPOOL) target_compile_options(extension_threadpool PUBLIC ${_common_compile_options}) # Install libraries From a92958a1f6fbfbc154ec24c6b4ee6c6ebd41aea8 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 18:41:25 -0800 Subject: [PATCH 06/14] Update [ghstack-poisoned] --- build/executorch-config.cmake | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 1a2da26416e..8f64c502b47 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -118,6 +118,8 @@ endforeach() # TODO: investigate use of install(EXPORT) to cleanly handle # target_compile_options/target_compile_definitions for everything. -set_target_properties( - cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_parallel -) +if (TARGET cpublas) + set_target_properties( + cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_parallel + ) +endif() From 3bd64370f6454f9523b6cc51d05d377fed8a77fb Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 19:54:10 -0800 Subject: [PATCH 07/14] Update [ghstack-poisoned] --- build/executorch-config.cmake | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 8f64c502b47..2c459b66ac8 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -118,7 +118,12 @@ endforeach() # TODO: investigate use of install(EXPORT) to cleanly handle # target_compile_options/target_compile_definitions for everything. -if (TARGET cpublas) +if(TARGET extension_parallel) + set_target_properties( + extension_parallel PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool + ) +endif() +if(TARGET cpublas) set_target_properties( cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_parallel ) From 9fdebee5a3e9a439895df57b49343816fcceee86 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 21:11:25 -0800 Subject: [PATCH 08/14] Update [ghstack-poisoned] --- build/cmake_deps.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index 4bbfd636a96..4811563269c 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -449,6 +449,7 @@ deps = [ "executorch_core", "extension_data_loader", "extension_module", + "extension_parallel", "portable_kernels", "quantized_kernels", "xnnpack_backend", From e48e81617b32f3460c5b449b8a524e221f79bef1 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 5 Mar 2025 14:40:02 -0800 Subject: [PATCH 09/14] Update [ghstack-poisoned] --- extension/parallel/targets.bzl | 38 ++++++++++++-------------- extension/parallel/thread_parallel.cpp | 12 +++++--- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/extension/parallel/targets.bzl b/extension/parallel/targets.bzl index 82b3deab129..82a8502c034 100644 --- a/extension/parallel/targets.bzl +++ b/extension/parallel/targets.bzl @@ -7,24 +7,20 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ - for aten_mode in get_aten_mode_options(): - aten_suffix = ("_aten" if aten_mode else "") - - runtime.cxx_library( - name = "thread_parallel" + aten_suffix, - srcs = [ - "thread_parallel.cpp", - ], - exported_headers = [ - "thread_parallel.h", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//executorch/extension/threadpool:threadpool", - "//executorch/runtime/core:core", - "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, - ], - ) + runtime.cxx_library( + name = "thread_parallel", + srcs = [ + "thread_parallel.cpp", + ], + exported_headers = [ + "thread_parallel.h", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//executorch/extension/threadpool:threadpool", + "//executorch/runtime/core:core", + ], + ) diff --git a/extension/parallel/thread_parallel.cpp b/extension/parallel/thread_parallel.cpp index dfbb911d3a9..5d481ccd44c 100644 --- a/extension/parallel/thread_parallel.cpp +++ b/extension/parallel/thread_parallel.cpp @@ -6,11 +6,12 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include #include -#include +#include #include namespace executorch { @@ -53,9 +54,12 @@ bool parallel_for( const int64_t end, const int64_t grain_size, const std::function& f) { - ET_LOG_AND_RETURN_IF_FALSE(begin >= 0 && end >= 0); - ET_LOG_AND_RETURN_IF_FALSE(end >= begin); - ET_LOG_AND_RETURN_IF_FALSE(grain_size > 0); + ET_CHECK_OR_RETURN_FALSE( + begin >= 0 && end >= 0 && end >= begin, + "begin = %" PRId64 ", end = %" PRId64, + begin, + end); + ET_CHECK_OR_RETURN_FALSE(grain_size > 0, "grain_size = %" PRId64, grain_size); int64_t num_tasks = 0, chunk_size = 0; std::tie(num_tasks, chunk_size) = calc_num_tasks_and_chunk_size(begin, end, grain_size); From 3351d50555714d72d46d0ff4f096eff4ab4e61c4 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 5 Mar 2025 16:38:46 -0800 Subject: [PATCH 10/14] Update [ghstack-poisoned] --- .lintrunner.toml | 2 + CMakeLists.txt | 1 - Test.cmake | 1 - build/cmake_deps.toml | 19 +--- build/executorch-config.cmake | 8 +- extension/llm/custom_ops/op_sdpa.cpp | 2 +- extension/llm/custom_ops/targets.bzl | 1 - extension/parallel/CMakeLists.txt | 25 ----- extension/parallel/TARGETS | 8 -- extension/parallel/targets.bzl | 26 ------ extension/parallel/test/TARGETS | 8 -- extension/parallel/test/targets.bzl | 19 ---- extension/parallel/thread_parallel.h | 49 ++-------- extension/threadpool/CMakeLists.txt | 7 +- extension/threadpool/targets.bzl | 3 + .../test/CMakeLists.txt | 20 +--- extension/threadpool/test/targets.bzl | 12 +++ .../test/thread_parallel_test.cpp | 41 ++++++--- .../thread_parallel.cpp | 2 +- kernels/optimized/CMakeLists.txt | 2 +- kernels/optimized/blas/BlasKernel.h | 2 +- kernels/optimized/lib_defs.bzl | 6 +- runtime/kernel/targets.bzl | 13 +++ runtime/kernel/thread_parallel_interface.h | 92 +++++++++++++++++++ test/utils/OSSTestConfig.json | 10 ++ 25 files changed, 185 insertions(+), 194 deletions(-) delete mode 100644 extension/parallel/CMakeLists.txt delete mode 100644 extension/parallel/TARGETS delete mode 100644 extension/parallel/targets.bzl delete mode 100644 extension/parallel/test/TARGETS delete mode 100644 extension/parallel/test/targets.bzl rename extension/{parallel => threadpool}/test/CMakeLists.txt (53%) rename extension/{parallel => threadpool}/test/thread_parallel_test.cpp (77%) rename extension/{parallel => threadpool}/thread_parallel.cpp (97%) create mode 100644 runtime/kernel/thread_parallel_interface.h diff --git a/.lintrunner.toml b/.lintrunner.toml index 7667ac430d1..1a27228d266 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -218,6 +218,8 @@ exclude_patterns = [ 'examples/**', 'extension/**', 'kernels/optimized/**', + # Justified include. + 'runtime/kernel/thread_parallel_interface.h', 'scripts/**', 'third-party/**', 'util/**', diff --git a/CMakeLists.txt b/CMakeLists.txt index 73b89b6171e..fabf667cbe1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -751,7 +751,6 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO ) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/parallel) endif() if(EXECUTORCH_BUILD_PYBIND) diff --git a/Test.cmake b/Test.cmake index d4b5f6aa1db..6bd7a86e70b 100644 --- a/Test.cmake +++ b/Test.cmake @@ -13,7 +13,6 @@ if(BUILD_TESTING) add_subdirectory(extension/evalue_util/test) add_subdirectory(extension/kernel_util/test) add_subdirectory(extension/memory_allocator/test) - add_subdirectory(extension/parallel/test) add_subdirectory(extension/pytree/test) add_subdirectory(kernels/portable/cpu/util/test) add_subdirectory(kernels/prim_ops/test) diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index b1ed81b6a7e..9937f1b882f 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -88,7 +88,6 @@ excludes = [ deps = [ "executorch", "executorch_core", - "extension_parallel", "extension_threadpool", "portable_kernels", ] @@ -131,7 +130,6 @@ excludes = [ deps = [ "executorch_core", "executorch", - "extension_parallel", ] [targets.optimized_native_cpu_ops] @@ -146,7 +144,6 @@ excludes = [ deps = [ "executorch_core", "executorch", - "extension_parallel", "extension_threadpool", "portable_kernels", ] @@ -227,19 +224,6 @@ deps = [ "extension_runner_util", ] -[targets.extension_parallel] -buck_targets = [ - "//extension/parallel:thread_parallel", -] -filters = [ - ".cpp$", -] -deps = [ - "executorch", - "executorch_core", - "extension_threadpool", -] - [targets.extension_tensor] buck_targets = [ "//extension/tensor:tensor", @@ -393,6 +377,7 @@ filters = [ deps = [ "executorch", "executorch_core", + "extension_threadpool", ] [targets.xnnpack_schema] @@ -427,7 +412,6 @@ deps = [ "executorch", "executorch_core", "optimized_kernels", - "extension_parallel", "extension_threadpool", "reduce_util", "xnnpack_backend", @@ -465,7 +449,6 @@ deps = [ "executorch_core", "extension_data_loader", "extension_module", - "extension_parallel", "portable_kernels", "quantized_kernels", "xnnpack_backend", diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 35fe03467f2..2e8cb95b70f 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -68,7 +68,6 @@ set(lib_list custom_ops extension_module extension_module_static - extension_parallel extension_runner_util extension_tensor extension_threadpool @@ -119,14 +118,9 @@ endforeach() # TODO: investigate use of install(EXPORT) to cleanly handle # target_compile_options/target_compile_definitions for everything. -if(TARGET extension_parallel) - set_target_properties( - extension_parallel PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool - ) -endif() if(TARGET cpublas) set_target_properties( - cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_parallel + cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool ) endif() target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL) diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index f0a7775e803..db7cb42f6d0 100644 --- a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -19,7 +19,7 @@ #include #ifdef ET_USE_THREADPOOL -#include +#include #include #endif #include diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl index e3e8b30520f..1c4686fe3d0 100644 --- a/extension/llm/custom_ops/targets.bzl +++ b/extension/llm/custom_ops/targets.bzl @@ -37,7 +37,6 @@ def define_common_targets(): "//executorch/kernels/optimized:libblas{}".format(mkl_dep), "//executorch/kernels/optimized:libvec", "//executorch/extension/kernel_util:kernel_util", - "//executorch/extension/parallel:thread_parallel", "//executorch/extension/threadpool:threadpool", ], deps = [ diff --git a/extension/parallel/CMakeLists.txt b/extension/parallel/CMakeLists.txt deleted file mode 100644 index 7f727aafe46..00000000000 --- a/extension/parallel/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# Please keep this file formatted by running: -# ~~~ -# cmake-format -i CMakeLists.txt -# ~~~ - -if(NOT (EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO)) - message(FATAL_ERROR "extension/parallel requires extension/threadpool") -endif() - -add_library(extension_parallel thread_parallel.cpp) - -target_link_libraries(extension_parallel PUBLIC executorch_core extension_threadpool) -target_compile_options(extension_parallel PUBLIC ${_common_compile_options}) - -install( - TARGETS extension_parallel - DESTINATION lib - INCLUDES - DESTINATION ${_common_include_directories}) diff --git a/extension/parallel/TARGETS b/extension/parallel/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/parallel/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/parallel/targets.bzl b/extension/parallel/targets.bzl deleted file mode 100644 index 82a8502c034..00000000000 --- a/extension/parallel/targets.bzl +++ /dev/null @@ -1,26 +0,0 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") - -def define_common_targets(): - """Defines targets that should be shared between fbcode and xplat. - - The directory containing this targets.bzl file should also contain both - TARGETS and BUCK files that call this function. - """ - - runtime.cxx_library( - name = "thread_parallel", - srcs = [ - "thread_parallel.cpp", - ], - exported_headers = [ - "thread_parallel.h", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//executorch/extension/threadpool:threadpool", - "//executorch/runtime/core:core", - ], - ) diff --git a/extension/parallel/test/TARGETS b/extension/parallel/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/parallel/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/parallel/test/targets.bzl b/extension/parallel/test/targets.bzl deleted file mode 100644 index 791c0727471..00000000000 --- a/extension/parallel/test/targets.bzl +++ /dev/null @@ -1,19 +0,0 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -def define_common_targets(): - """Defines targets that should be shared between fbcode and xplat. - - The directory containing this targets.bzl file should also contain both - TARGETS and BUCK files that call this function. - """ - - runtime.cxx_test( - name = "thread_parallel_test", - srcs = [ - "thread_parallel_test.cpp", - ], - deps = [ - "//executorch/extension/parallel:thread_parallel", - "//executorch/runtime/platform:platform", - ], - ) diff --git a/extension/parallel/thread_parallel.h b/extension/parallel/thread_parallel.h index 8b174075ae9..5f4edeb333c 100644 --- a/extension/parallel/thread_parallel.h +++ b/extension/parallel/thread_parallel.h @@ -8,46 +8,9 @@ #pragma once -#include -#include - -namespace executorch { -namespace extension { - -/** - * A helper to run function in parallel. - * - * begin, end: describe the extent of the workitems via first and last workitem - * to be processed - * grain_size: number of workitems processed by user callback which is - * described below - * f: user function applied in parallel to the chunks, signature: - * void f(int64_t begin, int64_t end) - * Returns true if all work items are processed successfully, false otherwise - * - * Warning: parallel_for does NOT copy thread local states from the current - * thread to the worker threads. Users need to protect the access to captured - * data if they mutate them in f. - */ -bool parallel_for( - const int64_t begin, - const int64_t end, - const int64_t grain_size, - const std::function& f); - -int64_t get_thread_num(); - -void set_thread_num(int64_t thread_num); - -} // namespace extension -} // namespace executorch - -namespace torch { -namespace executor { -// TODO(T197294990): Remove these deprecated aliases once all users have moved -// to the new `::executorch` namespaces. -using ::executorch::extension::get_thread_num; -using ::executorch::extension::parallel_for; -using ::executorch::extension::set_thread_num; -} // namespace executor -} // namespace torch +// This header is a stub left behind after the move to +// executorch/runtime/kernel. Depend on this target and include this +// header if you have a hard requirement for threading; if you want to +// cleanly use parallelization if available, then depend on and use +// the below header instead. +#include diff --git a/extension/threadpool/CMakeLists.txt b/extension/threadpool/CMakeLists.txt index c1d86acf75d..6e107cb6634 100644 --- a/extension/threadpool/CMakeLists.txt +++ b/extension/threadpool/CMakeLists.txt @@ -21,7 +21,8 @@ if(NOT CMAKE_CXX_STANDARD) endif() add_library( - extension_threadpool threadpool.cpp threadpool_guard.cpp cpuinfo_utils.cpp + extension_threadpool threadpool.cpp threadpool_guard.cpp thread_parallel.cpp + cpuinfo_utils.cpp ) target_link_libraries( extension_threadpool PUBLIC executorch_core cpuinfo pthreadpool @@ -42,3 +43,7 @@ install( INCLUDES DESTINATION ${_common_include_directories} ) + +if(BUILD_TESTING) + add_subdirectory(test) +endif() diff --git a/extension/threadpool/targets.bzl b/extension/threadpool/targets.bzl index 8bb0398b385..1c34dbbc7d4 100644 --- a/extension/threadpool/targets.bzl +++ b/extension/threadpool/targets.bzl @@ -9,6 +9,7 @@ def define_common_targets(): """ _THREADPOOL_SRCS = [ + "thread_parallel.cpp", "threadpool.cpp", "threadpool_guard.cpp", ] + (["fb/threadpool_use_n_threads.cpp"] if not runtime.is_oss else []) @@ -29,6 +30,8 @@ def define_common_targets(): exported_deps = [ third_party_dep("pthreadpool"), third_party_dep("cpuinfo"), + # Allow users to use the header without an extra deps entry. + "//executorch/runtime/kernel:thread_parallel_interface", ], exported_preprocessor_flags = [ "-DET_USE_THREADPOOL", diff --git a/extension/parallel/test/CMakeLists.txt b/extension/threadpool/test/CMakeLists.txt similarity index 53% rename from extension/parallel/test/CMakeLists.txt rename to extension/threadpool/test/CMakeLists.txt index ab37f66c17d..3f9b13f2ab4 100644 --- a/extension/parallel/test/CMakeLists.txt +++ b/extension/threadpool/test/CMakeLists.txt @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# @generated by test/utils/generate_gtest_cmakelists.py +# # This file should be formatted with # ~~~ # cmake-format -i CMakeLists.txt @@ -12,28 +14,14 @@ # cmake_minimum_required(VERSION 3.19) -project(extension_parallel_test) - -# Use C++17 for test. -set(CMAKE_CXX_STANDARD 17) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) include(${EXECUTORCH_ROOT}/build/Test.cmake) -set(_test_srcs thread_parallel_test.cpp ../thread_parallel.cpp) +set(_test_srcs thread_parallel_test.cpp threadpool_test.cpp) et_cxx_test( - extension_parallel_test - SOURCES - ${_test_srcs} - EXTRA_LIBS - pthreadpool - cpuinfo + extension_threadpool_test SOURCES ${_test_srcs} EXTRA_LIBS extension_threadpool ) -target_include_directories( - extension_parallel_test - PRIVATE ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include - ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include -) diff --git a/extension/threadpool/test/targets.bzl b/extension/threadpool/test/targets.bzl index b8a39d8969a..8bdf776c825 100644 --- a/extension/threadpool/test/targets.bzl +++ b/extension/threadpool/test/targets.bzl @@ -18,3 +18,15 @@ def define_common_targets(): "//executorch/extension/threadpool:threadpool", ], ) + + runtime.cxx_test( + name = "thread_parallel_test", + srcs = [ + "thread_parallel_test.cpp", + ], + deps = [ + "//executorch/extension/threadpool:threadpool", + "//executorch/runtime/kernel:thread_parallel_interface", + "//executorch/runtime/platform:platform", + ], + ) diff --git a/extension/parallel/test/thread_parallel_test.cpp b/extension/threadpool/test/thread_parallel_test.cpp similarity index 77% rename from extension/parallel/test/thread_parallel_test.cpp rename to extension/threadpool/test/thread_parallel_test.cpp index d386429100d..63581be29e8 100644 --- a/extension/parallel/test/thread_parallel_test.cpp +++ b/extension/threadpool/test/thread_parallel_test.cpp @@ -11,13 +11,16 @@ #include #include -#include +#include #include using namespace ::testing; using ::executorch::extension::parallel_for; -class ParallelTest : public ::testing::Test { +#ifndef ET_USE_THREADPOOL +#endif + +class ParallelTest : public ::testing::TestWithParam { protected: void SetUp() override { data_.fill(0); @@ -42,12 +45,20 @@ class ParallelTest : public ::testing::Test { } } + template + bool parallel_for(const int64_t begin, const int64_t end, const int64_t grain_size, const Func& func) { + if (GetParam()) { + return executorch::extension::parallel_for(begin, end, grain_size, func); + } + return executorch::extension::internal::parallel_for_no_threadpool(begin, end, grain_size, func); + } + std::array data_; std::mutex mutex_; int sum_of_all_elements_; }; -TEST_F(ParallelTest, TestAllInvoked) { +TEST_P(ParallelTest, TestAllInvoked) { EXPECT_TRUE(parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -57,7 +68,7 @@ TEST_F(ParallelTest, TestAllInvoked) { } } -TEST_F(ParallelTest, TestAllInvokedWithMutex) { +TEST_P(ParallelTest, TestAllInvokedWithMutex) { EXPECT_TRUE(parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); })); @@ -70,7 +81,7 @@ TEST_F(ParallelTest, TestAllInvokedWithMutex) { EXPECT_EQ(sum_of_all_elements_, expected_sum); } -TEST_F(ParallelTest, TestInvalidRange) { +TEST_P(ParallelTest, TestInvalidRange) { et_pal_init(); EXPECT_FALSE(parallel_for(10, 0, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); @@ -82,7 +93,7 @@ TEST_F(ParallelTest, TestInvalidRange) { EXPECT_EQ(sum_of_all_elements_, 0); } -TEST_F(ParallelTest, TestInvalidRange2) { +TEST_P(ParallelTest, TestInvalidRange2) { et_pal_init(); EXPECT_FALSE(parallel_for(6, 5, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); @@ -94,7 +105,7 @@ TEST_F(ParallelTest, TestInvalidRange2) { EXPECT_EQ(sum_of_all_elements_, 0); } -TEST_F(ParallelTest, TestInvokePartialFromBeginning) { +TEST_P(ParallelTest, TestInvokePartialFromBeginning) { EXPECT_TRUE(parallel_for(0, 5, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -107,7 +118,7 @@ TEST_F(ParallelTest, TestInvokePartialFromBeginning) { } } -TEST_F(ParallelTest, TestInvokePartialToEnd) { +TEST_P(ParallelTest, TestInvokePartialToEnd) { EXPECT_TRUE(parallel_for(5, 10, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -120,7 +131,7 @@ TEST_F(ParallelTest, TestInvokePartialToEnd) { } } -TEST_F(ParallelTest, TestInvokePartialMiddle) { +TEST_P(ParallelTest, TestInvokePartialMiddle) { EXPECT_TRUE(parallel_for(2, 8, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -136,7 +147,7 @@ TEST_F(ParallelTest, TestInvokePartialMiddle) { } } -TEST_F(ParallelTest, TestChunkSize2) { +TEST_P(ParallelTest, TestChunkSize2) { EXPECT_TRUE(parallel_for(0, 10, 2, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -146,7 +157,7 @@ TEST_F(ParallelTest, TestChunkSize2) { } } -TEST_F(ParallelTest, TestChunkSize2Middle) { +TEST_P(ParallelTest, TestChunkSize2Middle) { EXPECT_TRUE(parallel_for(3, 8, 2, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -162,7 +173,7 @@ TEST_F(ParallelTest, TestChunkSize2Middle) { } } -TEST_F(ParallelTest, TestChunkSize3) { +TEST_P(ParallelTest, TestChunkSize3) { EXPECT_TRUE(parallel_for(0, 10, 3, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -172,7 +183,7 @@ TEST_F(ParallelTest, TestChunkSize3) { } } -TEST_F(ParallelTest, TestChunkSize6) { +TEST_P(ParallelTest, TestChunkSize6) { EXPECT_TRUE(parallel_for(0, 10, 6, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -182,7 +193,7 @@ TEST_F(ParallelTest, TestChunkSize6) { } } -TEST_F(ParallelTest, TestChunkSizeTooLarge) { +TEST_P(ParallelTest, TestChunkSizeTooLarge) { EXPECT_TRUE(parallel_for(0, 10, 11, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -191,3 +202,5 @@ TEST_F(ParallelTest, TestChunkSizeTooLarge) { EXPECT_EQ(data_[i], i); } } + +INSTANTIATE_TEST_SUITE_P(ParallelTestWithOrWithoutThreadpool, ParallelTest, ::testing::Values(true, false)); diff --git a/extension/parallel/thread_parallel.cpp b/extension/threadpool/thread_parallel.cpp similarity index 97% rename from extension/parallel/thread_parallel.cpp rename to extension/threadpool/thread_parallel.cpp index 5d481ccd44c..fa26742368f 100644 --- a/extension/parallel/thread_parallel.cpp +++ b/extension/threadpool/thread_parallel.cpp @@ -9,9 +9,9 @@ #include #include -#include #include #include +#include #include namespace executorch { diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index c6d31c20263..23e26bfa72b 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -43,7 +43,7 @@ endif() list(TRANSFORM _optimized_cpublas__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(cpublas STATIC ${_optimized_cpublas__srcs}) target_link_libraries( - cpublas PUBLIC executorch_core eigen_blas extension_parallel extension_threadpool + cpublas PUBLIC executorch_core eigen_blas extension_threadpool ) target_compile_options(cpublas PUBLIC ${_common_compile_options}) diff --git a/kernels/optimized/blas/BlasKernel.h b/kernels/optimized/blas/BlasKernel.h index c2b03cfebdd..fc47b4482d6 100644 --- a/kernels/optimized/blas/BlasKernel.h +++ b/kernels/optimized/blas/BlasKernel.h @@ -11,8 +11,8 @@ #include #include -#include #include +#include #include diff --git a/kernels/optimized/lib_defs.bzl b/kernels/optimized/lib_defs.bzl index 659c7afe090..dd246f38984 100644 --- a/kernels/optimized/lib_defs.bzl +++ b/kernels/optimized/lib_defs.bzl @@ -186,7 +186,10 @@ def define_libs(is_fbcode=False): ], ) - LIBBLAS_DEPS = [third_party_dep("cpuinfo")] + LIBBLAS_DEPS = [ + third_party_dep("cpuinfo"), + "//executorch/extension/threadpool:threadpool", + ] for libblas_name, mkl_dep in [("libblas", "fbsource//third-party/mkl:mkl_lp64_omp"), ("libblas_mkl_noomp", "fbsource//third-party/mkl:mkl")]: runtime.cxx_library( @@ -229,7 +232,6 @@ def define_libs(is_fbcode=False): "DEFAULT": [], }) + LIBBLAS_DEPS, exported_deps = [ - "//executorch/extension/parallel:thread_parallel", "//executorch/kernels/optimized:libutils", "//executorch/runtime/core/exec_aten:lib", ], diff --git a/runtime/kernel/targets.bzl b/runtime/kernel/targets.bzl index d49435f2825..e67f76728b8 100644 --- a/runtime/kernel/targets.bzl +++ b/runtime/kernel/targets.bzl @@ -51,6 +51,19 @@ def define_common_targets(): preprocessor_flags = ["-DMAX_KERNEL_NUM=1"], ) + runtime.cxx_library( + name = "thread_parallel_interface", + exported_headers = ["thread_parallel_interface.h"], + exported_deps = [ + "//executorch/runtime/core:core", + "//executorch/runtime/platform:platform", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + ) + for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" diff --git a/runtime/kernel/thread_parallel_interface.h b/runtime/kernel/thread_parallel_interface.h new file mode 100644 index 00000000000..82e34ecf7c0 --- /dev/null +++ b/runtime/kernel/thread_parallel_interface.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace executorch { +namespace extension { +namespace internal { +template +inline bool parallel_for_no_threadpool( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const Func& f) { + ET_CHECK_OR_RETURN_FALSE( + begin >= 0 && end >= 0 && end >= begin, + "begin = %" PRId64 ", end = %" PRId64, + begin, + end); + ET_CHECK_OR_RETURN_FALSE(grain_size > 0, "grain_size = %" PRId64, grain_size); + f(begin, end); + return true; +} + +} // namespace internal + +#ifdef ET_USE_THREADPOOL +/** + * A helper to run a function in parallel. + * + * begin, end: describe the extent of the workitems via first and last workitem + * to be processed + * grain_size: number of workitems processed by user callback which is + * described below + * f: user function applied in parallel to the chunks, signature: + * void f(int64_t begin, int64_t end) + * Returns true if all work items are processed successfully, false otherwise + * + * Warning: parallel_for does NOT copy thread local states from the current + * thread to the worker threads. Users need to protect the access to captured + * data if they mutate them in f. + */ +bool parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const std::function& f); + +int64_t get_thread_num(); + +void set_thread_num(int64_t thread_num); +#else // ET_USE_THREADPOOL +template +bool parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const Func& func) { + return internal::parallel_for_no_threadpool(begin, end, grain_size, func); +} + +inline int64_t get_thread_num() { + return 0; +} + +void set_thread_num(int64_t thread_num) { + ET_DCHECK_MSG(false, "cannot set_thread_num without threading support!"); +} +#endif // ET_USE_THREADPOOL +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::get_thread_num; +using ::executorch::extension::parallel_for; +using ::executorch::extension::set_thread_num; +} // namespace executor +} // namespace torch diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json index cc5e625f1e8..be594f9d5f4 100644 --- a/test/utils/OSSTestConfig.json +++ b/test/utils/OSSTestConfig.json @@ -59,6 +59,16 @@ "extension_tensor" ] }, + { + "directory": "extension/threadpool/test", + "sources": [ + "thread_parallel_test.cpp", + "threadpool_test.cpp" + ], + "additional_libs": [ + "extension_threadpool" + ] + }, { "directory": "kernels/portable/cpu/util/test", "sources": [ From 0102e256c1b5dff99bad9ef25e6ab3982d2ab9b3 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 5 Mar 2025 17:36:57 -0800 Subject: [PATCH 11/14] Update [ghstack-poisoned] --- extension/llm/custom_ops/op_sdpa.cpp | 2 +- extension/threadpool/test/thread_parallel_test.cpp | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index db7cb42f6d0..371fcf38a24 100644 --- a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -19,8 +19,8 @@ #include #ifdef ET_USE_THREADPOOL -#include #include +#include #endif #include diff --git a/extension/threadpool/test/thread_parallel_test.cpp b/extension/threadpool/test/thread_parallel_test.cpp index 63581be29e8..e31f16eee22 100644 --- a/extension/threadpool/test/thread_parallel_test.cpp +++ b/extension/threadpool/test/thread_parallel_test.cpp @@ -46,11 +46,16 @@ class ParallelTest : public ::testing::TestWithParam { } template - bool parallel_for(const int64_t begin, const int64_t end, const int64_t grain_size, const Func& func) { + bool parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const Func& func) { if (GetParam()) { return executorch::extension::parallel_for(begin, end, grain_size, func); } - return executorch::extension::internal::parallel_for_no_threadpool(begin, end, grain_size, func); + return executorch::extension::internal::parallel_for_no_threadpool( + begin, end, grain_size, func); } std::array data_; @@ -203,4 +208,7 @@ TEST_P(ParallelTest, TestChunkSizeTooLarge) { } } -INSTANTIATE_TEST_SUITE_P(ParallelTestWithOrWithoutThreadpool, ParallelTest, ::testing::Values(true, false)); +INSTANTIATE_TEST_SUITE_P( + ParallelTestWithOrWithoutThreadpool, + ParallelTest, + ::testing::Values(true, false)); From 956f8a5ec412862697753db5c2d8f84decb990bb Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 5 Mar 2025 17:36:57 -0800 Subject: [PATCH 12/14] Update [ghstack-poisoned] --- kernels/portable/cpu/util/reduce_util.h | 20 ++++++++++++++------ kernels/portable/cpu/util/targets.bzl | 6 +++++- runtime/kernel/thread_parallel_interface.h | 14 +++++++++++++- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/kernels/portable/cpu/util/reduce_util.h b/kernels/portable/cpu/util/reduce_util.h index 35cfdfbaa72..6d7b17443ee 100644 --- a/kernels/portable/cpu/util/reduce_util.h +++ b/kernels/portable/cpu/util/reduce_util.h @@ -8,8 +8,10 @@ #pragma once +#include #include #include +#include #include #include @@ -24,9 +26,12 @@ void apply_on_flat_ix_with_stride_and_base( const size_t base, const size_t start, const size_t end) { - for (size_t i = start; i <= end; i++) { - fn(base + i * stride); - } + executorch::extension::parallel_for( + start, end + 1, [&](auto start_, auto end_) { + for (const auto i : c10::irange(start_, end_)) { + fn(base + i * stride); + } + }); } template @@ -36,9 +41,12 @@ void apply_on_flat_and_dim_ix_with_stride_and_base( const size_t base, const size_t start, const size_t end) { - for (size_t i = start; i <= end; i++) { - fn(base + i * stride, i); - } + executorch::extension::parallel_for( + start, end + 1, [&](auto start_, auto end_) { + for (const auto i : c10::irange(start_, end_)) { + fn(base + i * stride, i); + } + }); } template diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index c42f38fd8b0..3a7e4e1f9bc 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -299,8 +299,12 @@ def define_common_targets(): srcs = ["reduce_util.cpp"], exported_headers = ["reduce_util.h"], deps = [ - "//executorch/runtime/kernel:kernel_includes{}".format(suffix), "//executorch/runtime/core/exec_aten/util:tensor_util{}".format(suffix), + "//executorch/runtime/kernel:kernel_includes{}".format(suffix), + ], + exported_deps = [ + "//executorch/runtime/kernel:thread_parallel_interface", + "//executorch/runtime/core/portable_type/c10/c10:c10", ], exported_preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_mode else [], visibility = [ diff --git a/runtime/kernel/thread_parallel_interface.h b/runtime/kernel/thread_parallel_interface.h index 82e34ecf7c0..ad90218fd22 100644 --- a/runtime/kernel/thread_parallel_interface.h +++ b/runtime/kernel/thread_parallel_interface.h @@ -33,6 +33,10 @@ inline bool parallel_for_no_threadpool( return true; } +// Match GRAIN_SIZE from PyTorch core. +// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/TensorIterator.h#L78 +constexpr int64_t GRAIN_SIZE = 32768; + } // namespace internal #ifdef ET_USE_THREADPOOL @@ -74,10 +78,18 @@ inline int64_t get_thread_num() { return 0; } -void set_thread_num(int64_t thread_num) { +inline void set_thread_num(int64_t thread_num) { ET_DCHECK_MSG(false, "cannot set_thread_num without threading support!"); } #endif // ET_USE_THREADPOOL + +/** + * Convenience version of parallel_for that sets the grain size to internal::GRAIN_SIZE. + */ +template +bool parallel_for(const int64_t begin, const int64_t end, const Func& func) { + return parallel_for(begin, end, internal::GRAIN_SIZE, func); +} } // namespace extension } // namespace executorch From c130224cbfd24d014f1e758264b9c974426dc683 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 5 Mar 2025 20:09:42 -0800 Subject: [PATCH 13/14] Update [ghstack-poisoned] --- extension/parallel/thread_parallel.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/extension/parallel/thread_parallel.h b/extension/parallel/thread_parallel.h index 5f4edeb333c..8bd1a572cd7 100644 --- a/extension/parallel/thread_parallel.h +++ b/extension/parallel/thread_parallel.h @@ -9,8 +9,6 @@ #pragma once // This header is a stub left behind after the move to -// executorch/runtime/kernel. Depend on this target and include this -// header if you have a hard requirement for threading; if you want to -// cleanly use parallelization if available, then depend on and use -// the below header instead. +// executorch/runtime/kernel. As such, it is deprecated; include and +// use the below header directly instead. #include From 754a4f6db525a3c33327b8ca9c00e6f22326266d Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 6 Mar 2025 09:56:39 -0800 Subject: [PATCH 14/14] Update [ghstack-poisoned] --- build/executorch-config.cmake | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index e2cff7da6b5..b49f45aa241 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -1,4 +1,3 @@ - # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # @@ -16,20 +15,23 @@ # # This will define the following variables: # -# EXECUTORCH_FOUND -- True if the system has the ExecuTorch library -# EXECUTORCH_INCLUDE_DIRS -- The include directories for ExecuTorch -# EXECUTORCH_LIBRARIES -- Libraries to link against +# EXECUTORCH_FOUND -- True if the system has the ExecuTorch library +# EXECUTORCH_INCLUDE_DIRS -- The include directories for ExecuTorch +# EXECUTORCH_LIBRARIES -- Libraries to link against # -# The actual values for these variables will be different from what executorch-config.cmake -# in executorch pip package gives, but we wanted to keep the contract of exposing these -# CMake variables. +# The actual values for these variables will be different from what +# executorch-config.cmake in executorch pip package gives, but we wanted to keep +# the contract of exposing these CMake variables. cmake_minimum_required(VERSION 3.19) set(_root "${CMAKE_CURRENT_LIST_DIR}/../../..") set(required_lib_list executorch executorch_core portable_kernels) set(EXECUTORCH_LIBRARIES) -set(EXECUTORCH_INCLUDE_DIRS ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 ${_root}/lib) +set(EXECUTORCH_INCLUDE_DIRS + ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 + ${_root}/lib +) foreach(lib ${required_lib_list}) set(lib_var "LIB_${lib}") add_library(${lib} STATIC IMPORTED) @@ -40,7 +42,12 @@ foreach(lib ${required_lib_list}) ) set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}") target_compile_definitions(${lib} INTERFACE C10_USING_CUSTOM_GENERATED_MACROS) - target_include_directories(${lib} INTERFACE ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 ${_root}/lib) + target_include_directories( + ${lib} + INTERFACE ${_root}/include + ${_root}/include/executorch/runtime/core/portable_type/c10 + ${_root}/lib + ) list(APPEND EXECUTORCH_LIBRARIES ${lib}) endforeach() @@ -112,7 +119,12 @@ foreach(lib ${lib_list}) add_library(${lib} STATIC IMPORTED) endif() set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}") - target_include_directories(${lib} INTERFACE ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 ${_root}/lib) + target_include_directories( + ${lib} + INTERFACE ${_root}/include + ${_root}/include/executorch/runtime/core/portable_type/c10 + ${_root}/lib + ) list(APPEND EXECUTORCH_LIBRARIES ${lib}) endif() endforeach()