diff --git a/.lintrunner.toml b/.lintrunner.toml index 7667ac430d1..1a27228d266 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -218,6 +218,8 @@ exclude_patterns = [ 'examples/**', 'extension/**', 'kernels/optimized/**', + # Justified include. + 'runtime/kernel/thread_parallel_interface.h', 'scripts/**', 'third-party/**', 'util/**', diff --git a/CMakeLists.txt b/CMakeLists.txt index 73b89b6171e..fabf667cbe1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -751,7 +751,6 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO ) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/parallel) endif() if(EXECUTORCH_BUILD_PYBIND) diff --git a/Test.cmake b/Test.cmake index d4b5f6aa1db..6bd7a86e70b 100644 --- a/Test.cmake +++ b/Test.cmake @@ -13,7 +13,6 @@ if(BUILD_TESTING) add_subdirectory(extension/evalue_util/test) add_subdirectory(extension/kernel_util/test) add_subdirectory(extension/memory_allocator/test) - add_subdirectory(extension/parallel/test) add_subdirectory(extension/pytree/test) add_subdirectory(kernels/portable/cpu/util/test) add_subdirectory(kernels/prim_ops/test) diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index b1ed81b6a7e..bbc9eec3a0e 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -88,7 +88,6 @@ excludes = [ deps = [ "executorch", "executorch_core", - "extension_parallel", "extension_threadpool", "portable_kernels", ] @@ -131,7 +130,7 @@ excludes = [ deps = [ "executorch_core", "executorch", - "extension_parallel", + "extension_threadpool", ] [targets.optimized_native_cpu_ops] @@ -146,7 +145,6 @@ excludes = [ deps = [ "executorch_core", "executorch", - "extension_parallel", "extension_threadpool", "portable_kernels", ] @@ -227,19 +225,6 @@ deps = [ "extension_runner_util", ] -[targets.extension_parallel] -buck_targets = [ - "//extension/parallel:thread_parallel", -] -filters = [ - ".cpp$", -] -deps = [ - "executorch", - "executorch_core", - "extension_threadpool", -] - [targets.extension_tensor] buck_targets = [ "//extension/tensor:tensor", @@ -379,6 +364,7 @@ excludes = [ deps = [ "executorch", "executorch_core", + "extension_threadpool", "xnnpack_backend", "portable_kernels", ] @@ -393,6 +379,7 @@ filters = [ deps = [ "executorch", "executorch_core", + "extension_threadpool", ] [targets.xnnpack_schema] @@ -427,7 +414,6 @@ deps = [ "executorch", "executorch_core", "optimized_kernels", - "extension_parallel", "extension_threadpool", "reduce_util", "xnnpack_backend", @@ -465,7 +451,7 @@ deps = [ "executorch_core", "extension_data_loader", "extension_module", - "extension_parallel", + "extension_threadpool", "portable_kernels", "quantized_kernels", "xnnpack_backend", diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 9d429490d58..931d31de8ef 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -75,7 +75,6 @@ set(lib_list custom_ops extension_module extension_module_static - extension_parallel extension_runner_util extension_tensor extension_threadpool @@ -131,14 +130,9 @@ endforeach() # TODO: investigate use of install(EXPORT) to cleanly handle # target_compile_options/target_compile_definitions for everything. 
-if(TARGET extension_parallel) - set_target_properties( - extension_parallel PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool - ) -endif() if(TARGET cpublas) set_target_properties( - cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_parallel + cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool ) endif() if(TARGET extension_threadpool) diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index f0a7775e803..371fcf38a24 100644 --- a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -19,8 +19,8 @@ #include #ifdef ET_USE_THREADPOOL -#include #include +#include #endif #include diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl index e3e8b30520f..1c4686fe3d0 100644 --- a/extension/llm/custom_ops/targets.bzl +++ b/extension/llm/custom_ops/targets.bzl @@ -37,7 +37,6 @@ def define_common_targets(): "//executorch/kernels/optimized:libblas{}".format(mkl_dep), "//executorch/kernels/optimized:libvec", "//executorch/extension/kernel_util:kernel_util", - "//executorch/extension/parallel:thread_parallel", "//executorch/extension/threadpool:threadpool", ], deps = [ diff --git a/extension/parallel/CMakeLists.txt b/extension/parallel/CMakeLists.txt deleted file mode 100644 index 7f727aafe46..00000000000 --- a/extension/parallel/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# Please keep this file formatted by running: -# ~~~ -# cmake-format -i CMakeLists.txt -# ~~~ - -if(NOT (EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO)) - message(FATAL_ERROR "extension/parallel requires extension/threadpool") -endif() - -add_library(extension_parallel thread_parallel.cpp) - -target_link_libraries(extension_parallel PUBLIC executorch_core extension_threadpool) -target_compile_options(extension_parallel PUBLIC ${_common_compile_options}) - -install( - TARGETS extension_parallel - DESTINATION lib - INCLUDES - DESTINATION ${_common_include_directories}) diff --git a/extension/parallel/TARGETS b/extension/parallel/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/parallel/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/parallel/targets.bzl b/extension/parallel/targets.bzl deleted file mode 100644 index 82a8502c034..00000000000 --- a/extension/parallel/targets.bzl +++ /dev/null @@ -1,26 +0,0 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") - -def define_common_targets(): - """Defines targets that should be shared between fbcode and xplat. - - The directory containing this targets.bzl file should also contain both - TARGETS and BUCK files that call this function. 
- """ - - runtime.cxx_library( - name = "thread_parallel", - srcs = [ - "thread_parallel.cpp", - ], - exported_headers = [ - "thread_parallel.h", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//executorch/extension/threadpool:threadpool", - "//executorch/runtime/core:core", - ], - ) diff --git a/extension/parallel/test/TARGETS b/extension/parallel/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/parallel/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/parallel/test/targets.bzl b/extension/parallel/test/targets.bzl deleted file mode 100644 index 791c0727471..00000000000 --- a/extension/parallel/test/targets.bzl +++ /dev/null @@ -1,19 +0,0 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -def define_common_targets(): - """Defines targets that should be shared between fbcode and xplat. - - The directory containing this targets.bzl file should also contain both - TARGETS and BUCK files that call this function. - """ - - runtime.cxx_test( - name = "thread_parallel_test", - srcs = [ - "thread_parallel_test.cpp", - ], - deps = [ - "//executorch/extension/parallel:thread_parallel", - "//executorch/runtime/platform:platform", - ], - ) diff --git a/extension/parallel/thread_parallel.h b/extension/parallel/thread_parallel.h index 8b174075ae9..8bd1a572cd7 100644 --- a/extension/parallel/thread_parallel.h +++ b/extension/parallel/thread_parallel.h @@ -8,46 +8,7 @@ #pragma once -#include -#include - -namespace executorch { -namespace extension { - -/** - * A helper to run function in parallel. - * - * begin, end: describe the extent of the workitems via first and last workitem - * to be processed - * grain_size: number of workitems processed by user callback which is - * described below - * f: user function applied in parallel to the chunks, signature: - * void f(int64_t begin, int64_t end) - * Returns true if all work items are processed successfully, false otherwise - * - * Warning: parallel_for does NOT copy thread local states from the current - * thread to the worker threads. Users need to protect the access to captured - * data if they mutate them in f. - */ -bool parallel_for( - const int64_t begin, - const int64_t end, - const int64_t grain_size, - const std::function& f); - -int64_t get_thread_num(); - -void set_thread_num(int64_t thread_num); - -} // namespace extension -} // namespace executorch - -namespace torch { -namespace executor { -// TODO(T197294990): Remove these deprecated aliases once all users have moved -// to the new `::executorch` namespaces. -using ::executorch::extension::get_thread_num; -using ::executorch::extension::parallel_for; -using ::executorch::extension::set_thread_num; -} // namespace executor -} // namespace torch +// This header is a stub left behind after the move to +// executorch/runtime/kernel. As such, it is deprecated; include and +// use the below header directly instead. 
+#include diff --git a/extension/threadpool/CMakeLists.txt b/extension/threadpool/CMakeLists.txt index c1d86acf75d..6e107cb6634 100644 --- a/extension/threadpool/CMakeLists.txt +++ b/extension/threadpool/CMakeLists.txt @@ -21,7 +21,8 @@ if(NOT CMAKE_CXX_STANDARD) endif() add_library( - extension_threadpool threadpool.cpp threadpool_guard.cpp cpuinfo_utils.cpp + extension_threadpool threadpool.cpp threadpool_guard.cpp thread_parallel.cpp + cpuinfo_utils.cpp ) target_link_libraries( extension_threadpool PUBLIC executorch_core cpuinfo pthreadpool @@ -42,3 +43,7 @@ install( INCLUDES DESTINATION ${_common_include_directories} ) + +if(BUILD_TESTING) + add_subdirectory(test) +endif() diff --git a/extension/threadpool/targets.bzl b/extension/threadpool/targets.bzl index 8bb0398b385..1c34dbbc7d4 100644 --- a/extension/threadpool/targets.bzl +++ b/extension/threadpool/targets.bzl @@ -9,6 +9,7 @@ def define_common_targets(): """ _THREADPOOL_SRCS = [ + "thread_parallel.cpp", "threadpool.cpp", "threadpool_guard.cpp", ] + (["fb/threadpool_use_n_threads.cpp"] if not runtime.is_oss else []) @@ -29,6 +30,8 @@ def define_common_targets(): exported_deps = [ third_party_dep("pthreadpool"), third_party_dep("cpuinfo"), + # Allow users to use the header without an extra deps entry. + "//executorch/runtime/kernel:thread_parallel_interface", ], exported_preprocessor_flags = [ "-DET_USE_THREADPOOL", diff --git a/extension/parallel/test/CMakeLists.txt b/extension/threadpool/test/CMakeLists.txt similarity index 53% rename from extension/parallel/test/CMakeLists.txt rename to extension/threadpool/test/CMakeLists.txt index ab37f66c17d..3f9b13f2ab4 100644 --- a/extension/parallel/test/CMakeLists.txt +++ b/extension/threadpool/test/CMakeLists.txt @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# @generated by test/utils/generate_gtest_cmakelists.py +# # This file should be formatted with # ~~~ # cmake-format -i CMakeLists.txt @@ -12,28 +14,14 @@ # cmake_minimum_required(VERSION 3.19) -project(extension_parallel_test) - -# Use C++17 for test. -set(CMAKE_CXX_STANDARD 17) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
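With this change, extension/parallel/thread_parallel.h is only a deprecation stub, and extension_threadpool both compiles thread_parallel.cpp and re-exports //executorch/runtime/kernel:thread_parallel_interface, so call sites only need the new header. A minimal sketch of the intended call site follows; the function scale_rows and its arguments are hypothetical and not part of this patch, and the threadpool-backed implementation is only picked up when extension_threadpool is in the build (otherwise ET_USE_THREADPOOL is undefined and the header's inline serial fallback is used):

// Hedged sketch of a post-migration call site; only the interface header is
// included, and parallel_for comes either from extension_threadpool or from
// the header-only fallback.
#include <executorch/runtime/kernel/thread_parallel_interface.h>

#include <cstdint>

void scale_rows(float* data, int64_t rows, int64_t cols, float factor) {
  const bool ok = ::executorch::extension::parallel_for(
      /*begin=*/0,
      /*end=*/rows,
      /*grain_size=*/1,
      [&](int64_t row_begin, int64_t row_end) {
        for (int64_t r = row_begin; r < row_end; ++r) {
          for (int64_t c = 0; c < cols; ++c) {
            data[r * cols + c] *= factor;
          }
        }
      });
  (void)ok; // true when every work item was processed
}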
include(${EXECUTORCH_ROOT}/build/Test.cmake) -set(_test_srcs thread_parallel_test.cpp ../thread_parallel.cpp) +set(_test_srcs thread_parallel_test.cpp threadpool_test.cpp) et_cxx_test( - extension_parallel_test - SOURCES - ${_test_srcs} - EXTRA_LIBS - pthreadpool - cpuinfo + extension_threadpool_test SOURCES ${_test_srcs} EXTRA_LIBS extension_threadpool ) -target_include_directories( - extension_parallel_test - PRIVATE ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include - ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include -) diff --git a/extension/threadpool/test/targets.bzl b/extension/threadpool/test/targets.bzl index b8a39d8969a..8bdf776c825 100644 --- a/extension/threadpool/test/targets.bzl +++ b/extension/threadpool/test/targets.bzl @@ -18,3 +18,15 @@ def define_common_targets(): "//executorch/extension/threadpool:threadpool", ], ) + + runtime.cxx_test( + name = "thread_parallel_test", + srcs = [ + "thread_parallel_test.cpp", + ], + deps = [ + "//executorch/extension/threadpool:threadpool", + "//executorch/runtime/kernel:thread_parallel_interface", + "//executorch/runtime/platform:platform", + ], + ) diff --git a/extension/parallel/test/thread_parallel_test.cpp b/extension/threadpool/test/thread_parallel_test.cpp similarity index 77% rename from extension/parallel/test/thread_parallel_test.cpp rename to extension/threadpool/test/thread_parallel_test.cpp index d386429100d..fd72211a789 100644 --- a/extension/parallel/test/thread_parallel_test.cpp +++ b/extension/threadpool/test/thread_parallel_test.cpp @@ -11,13 +11,13 @@ #include #include -#include +#include #include using namespace ::testing; using ::executorch::extension::parallel_for; -class ParallelTest : public ::testing::Test { +class ParallelTest : public ::testing::TestWithParam { protected: void SetUp() override { data_.fill(0); @@ -42,12 +42,25 @@ class ParallelTest : public ::testing::Test { } } + template + bool parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const Func& func) { + if (GetParam()) { + return executorch::extension::parallel_for(begin, end, grain_size, func); + } + return executorch::extension::internal::parallel_for_no_threadpool( + begin, end, grain_size, func); + } + std::array data_; std::mutex mutex_; int sum_of_all_elements_; }; -TEST_F(ParallelTest, TestAllInvoked) { +TEST_P(ParallelTest, TestAllInvoked) { EXPECT_TRUE(parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -57,7 +70,7 @@ TEST_F(ParallelTest, TestAllInvoked) { } } -TEST_F(ParallelTest, TestAllInvokedWithMutex) { +TEST_P(ParallelTest, TestAllInvokedWithMutex) { EXPECT_TRUE(parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); })); @@ -70,7 +83,7 @@ TEST_F(ParallelTest, TestAllInvokedWithMutex) { EXPECT_EQ(sum_of_all_elements_, expected_sum); } -TEST_F(ParallelTest, TestInvalidRange) { +TEST_P(ParallelTest, TestInvalidRange) { et_pal_init(); EXPECT_FALSE(parallel_for(10, 0, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); @@ -82,7 +95,7 @@ TEST_F(ParallelTest, TestInvalidRange) { EXPECT_EQ(sum_of_all_elements_, 0); } -TEST_F(ParallelTest, TestInvalidRange2) { +TEST_P(ParallelTest, TestInvalidRange2) { et_pal_init(); EXPECT_FALSE(parallel_for(6, 5, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); @@ -94,7 +107,7 @@ TEST_F(ParallelTest, TestInvalidRange2) { EXPECT_EQ(sum_of_all_elements_, 0); } -TEST_F(ParallelTest, 
TestInvokePartialFromBeginning) { +TEST_P(ParallelTest, TestInvokePartialFromBeginning) { EXPECT_TRUE(parallel_for(0, 5, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -107,7 +120,7 @@ TEST_F(ParallelTest, TestInvokePartialFromBeginning) { } } -TEST_F(ParallelTest, TestInvokePartialToEnd) { +TEST_P(ParallelTest, TestInvokePartialToEnd) { EXPECT_TRUE(parallel_for(5, 10, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -120,7 +133,7 @@ TEST_F(ParallelTest, TestInvokePartialToEnd) { } } -TEST_F(ParallelTest, TestInvokePartialMiddle) { +TEST_P(ParallelTest, TestInvokePartialMiddle) { EXPECT_TRUE(parallel_for(2, 8, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -136,7 +149,7 @@ TEST_F(ParallelTest, TestInvokePartialMiddle) { } } -TEST_F(ParallelTest, TestChunkSize2) { +TEST_P(ParallelTest, TestChunkSize2) { EXPECT_TRUE(parallel_for(0, 10, 2, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -146,7 +159,7 @@ TEST_F(ParallelTest, TestChunkSize2) { } } -TEST_F(ParallelTest, TestChunkSize2Middle) { +TEST_P(ParallelTest, TestChunkSize2Middle) { EXPECT_TRUE(parallel_for(3, 8, 2, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -162,7 +175,7 @@ TEST_F(ParallelTest, TestChunkSize2Middle) { } } -TEST_F(ParallelTest, TestChunkSize3) { +TEST_P(ParallelTest, TestChunkSize3) { EXPECT_TRUE(parallel_for(0, 10, 3, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -172,7 +185,7 @@ TEST_F(ParallelTest, TestChunkSize3) { } } -TEST_F(ParallelTest, TestChunkSize6) { +TEST_P(ParallelTest, TestChunkSize6) { EXPECT_TRUE(parallel_for(0, 10, 6, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -182,7 +195,7 @@ TEST_F(ParallelTest, TestChunkSize6) { } } -TEST_F(ParallelTest, TestChunkSizeTooLarge) { +TEST_P(ParallelTest, TestChunkSizeTooLarge) { EXPECT_TRUE(parallel_for(0, 10, 11, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -191,3 +204,8 @@ TEST_F(ParallelTest, TestChunkSizeTooLarge) { EXPECT_EQ(data_[i], i); } } + +INSTANTIATE_TEST_SUITE_P( + ParallelTestWithOrWithoutThreadpool, + ParallelTest, + ::testing::Values(true, false)); diff --git a/extension/parallel/thread_parallel.cpp b/extension/threadpool/thread_parallel.cpp similarity index 97% rename from extension/parallel/thread_parallel.cpp rename to extension/threadpool/thread_parallel.cpp index fa09b240ad1..3c79a6775e6 100644 --- a/extension/parallel/thread_parallel.cpp +++ b/extension/threadpool/thread_parallel.cpp @@ -10,9 +10,9 @@ #include #include -#include #include #include +#include #include namespace executorch { diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index c6d31c20263..23e26bfa72b 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -43,7 +43,7 @@ endif() list(TRANSFORM _optimized_cpublas__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(cpublas STATIC ${_optimized_cpublas__srcs}) target_link_libraries( - cpublas PUBLIC executorch_core eigen_blas extension_parallel extension_threadpool + cpublas PUBLIC executorch_core eigen_blas extension_threadpool ) target_compile_options(cpublas PUBLIC ${_common_compile_options}) diff --git a/kernels/optimized/blas/BlasKernel.h b/kernels/optimized/blas/BlasKernel.h index c2b03cfebdd..fc47b4482d6 100644 --- a/kernels/optimized/blas/BlasKernel.h +++ b/kernels/optimized/blas/BlasKernel.h @@ -11,8 +11,8 @@ #include #include -#include #include +#include 
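The rewritten test above now runs every case twice, once against the threadpool-backed parallel_for and once against internal::parallel_for_no_threadpool, and its mutex-guarded task illustrates the header's warning: parallel_for does not copy thread-local state, so captured variables mutated inside the callback must be synchronized by the caller. A small sketch under that assumption (sum_all is a hypothetical example, not part of the patch):

// Each chunk accumulates into a local partial sum and only takes the lock
// once per callback invocation before touching the shared total.
#include <executorch/runtime/kernel/thread_parallel_interface.h>

#include <cstdint>
#include <mutex>
#include <vector>

int64_t sum_all(const std::vector<int64_t>& values) {
  int64_t total = 0;
  std::mutex mutex;
  ::executorch::extension::parallel_for(
      0, static_cast<int64_t>(values.size()), /*grain_size=*/1,
      [&](int64_t begin, int64_t end) {
        int64_t partial = 0; // local to this chunk, no locking needed
        for (int64_t i = begin; i < end; ++i) {
          partial += values[i];
        }
        std::lock_guard<std::mutex> guard(mutex); // protect the shared total
        total += partial;
      });
  return total;
}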
#include diff --git a/kernels/optimized/lib_defs.bzl b/kernels/optimized/lib_defs.bzl index 659c7afe090..dd246f38984 100644 --- a/kernels/optimized/lib_defs.bzl +++ b/kernels/optimized/lib_defs.bzl @@ -186,7 +186,10 @@ def define_libs(is_fbcode=False): ], ) - LIBBLAS_DEPS = [third_party_dep("cpuinfo")] + LIBBLAS_DEPS = [ + third_party_dep("cpuinfo"), + "//executorch/extension/threadpool:threadpool", + ] for libblas_name, mkl_dep in [("libblas", "fbsource//third-party/mkl:mkl_lp64_omp"), ("libblas_mkl_noomp", "fbsource//third-party/mkl:mkl")]: runtime.cxx_library( @@ -229,7 +232,6 @@ def define_libs(is_fbcode=False): "DEFAULT": [], }) + LIBBLAS_DEPS, exported_deps = [ - "//executorch/extension/parallel:thread_parallel", "//executorch/kernels/optimized:libutils", "//executorch/runtime/core/exec_aten:lib", ], diff --git a/kernels/portable/cpu/util/broadcast_util.cpp b/kernels/portable/cpu/util/broadcast_util.cpp index 381e07cbe30..28a34426b23 100644 --- a/kernels/portable/cpu/util/broadcast_util.cpp +++ b/kernels/portable/cpu/util/broadcast_util.cpp @@ -269,28 +269,6 @@ ET_NODISCARD Error get_broadcast_target_size( a.sizes(), b.sizes(), out_sizes, out_sizes_len, out_dim); } -void delinearize_index( - size_t linear_index, - executorch::aten::ArrayRef shape, - size_t* out_indexes, - const size_t out_indexes_len) { - ET_CHECK(shape.size() <= out_indexes_len); - for (size_t i = 0; i < shape.size(); ++i) { - auto dim = shape.size() - 1 - i; - auto dim_size = shape[dim]; - out_indexes[dim] = linear_index % dim_size; - linear_index /= dim_size; - } -} - -void delinearize_index( - size_t linear_index, - const Tensor& t, - size_t* out_indexes, - const size_t out_indexes_len) { - delinearize_index(linear_index, t.sizes(), out_indexes, out_indexes_len); -} - size_t linearize_access_indexes( ArrayRef indexes_broadcast_to, ssize_t broadcast_to_ndim, diff --git a/kernels/portable/cpu/util/broadcast_util.h b/kernels/portable/cpu/util/broadcast_util.h index f6bfae9bdaa..ed536f86c2d 100644 --- a/kernels/portable/cpu/util/broadcast_util.h +++ b/kernels/portable/cpu/util/broadcast_util.h @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -207,36 +208,6 @@ ET_NODISCARD inline Error resize_to_broadcast_target_size( ET_DEPRECATED void free_broadcast_tensor( const executorch::aten::Tensor& broadcast_tensor); -/** - * Delinearize a flattened index to per-dimension indexes. - * - * @param[in] linear_index The flattened index - * @param[in] shape The tensor shape - * @param[out] out_indexes The per-dimension indexes - * @param[in] out_indexes_len The maximum size of the out_indexes array - * @returns void - */ -void delinearize_index( - size_t linear_index, - executorch::aten::ArrayRef shape, - size_t* out_indexes, - const size_t out_indexes_len); - -/** - * Delinearize a flattened index to per-dimension indexes. 
- * - * @param[in] linear_index The flattened index - * @param[in] t The tensor object - * @param[out] out_indexes The per-dimension indexes - * @param[in] out_indexes_len The maximum size of the out_indexes array - * @returns void - */ -void delinearize_index( - size_t linear_index, - const Tensor& t, - size_t* out_indexes, - const size_t out_indexes_len); - /** * Return the linear index for broatcast_from tensor, given the indexes and * number of dimensions of broadcast_to tensor, and the shape and strides diff --git a/kernels/portable/cpu/util/delinearize_index.cpp b/kernels/portable/cpu/util/delinearize_index.cpp new file mode 100644 index 00000000000..45378e6b05d --- /dev/null +++ b/kernels/portable/cpu/util/delinearize_index.cpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include + +namespace torch::executor { +void delinearize_index( + size_t linear_index, + executorch::aten::ArrayRef shape, + size_t* out_indexes, + const size_t out_indexes_len) { + ET_CHECK(shape.size() <= out_indexes_len); + for (size_t i = 0; i < shape.size(); ++i) { + auto dim = shape.size() - 1 - i; + auto dim_size = shape[dim]; + out_indexes[dim] = linear_index % dim_size; + linear_index /= dim_size; + } +} + +void delinearize_index( + size_t linear_index, + const Tensor& t, + size_t* out_indexes, + const size_t out_indexes_len) { + delinearize_index(linear_index, t.sizes(), out_indexes, out_indexes_len); +} +} // namespace torch::executor diff --git a/kernels/portable/cpu/util/delinearize_index.h b/kernels/portable/cpu/util/delinearize_index.h new file mode 100644 index 00000000000..3441aa6083f --- /dev/null +++ b/kernels/portable/cpu/util/delinearize_index.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace torch::executor { +/** + * Delinearize a flattened index to per-dimension indexes. + * + * @param[in] linear_index The flattened index + * @param[in] shape The tensor shape + * @param[out] out_indexes The per-dimension indexes + * @param[in] out_indexes_len The maximum size of the out_indexes array + * @returns void + */ +void delinearize_index( + size_t linear_index, + executorch::aten::ArrayRef shape, + size_t* out_indexes, + const size_t out_indexes_len); + +/** + * Delinearize a flattened index to per-dimension indexes. 
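delinearize_index, now split out of broadcast_util, converts a flat index into per-dimension indexes by taking the remainder and dividing by each dimension size from the innermost dimension outward. A standalone illustration of that arithmetic (plain C++, deliberately not calling the ExecuTorch API so it stays self-contained):

// For a 2x3x4 tensor, flat index 17 = 1*(3*4) + 1*4 + 1 maps to {1, 1, 1}.
#include <cassert>
#include <cstddef>

int main() {
  const size_t shape[3] = {2, 3, 4};
  size_t linear_index = 17;
  size_t out_indexes[3] = {0, 0, 0};

  for (size_t i = 0; i < 3; ++i) {
    const size_t dim = 3 - 1 - i; // walk dimensions from last to first
    out_indexes[dim] = linear_index % shape[dim];
    linear_index /= shape[dim];
  }

  assert(out_indexes[0] == 1 && out_indexes[1] == 1 && out_indexes[2] == 1);
  return 0;
}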
+ * + * @param[in] linear_index The flattened index + * @param[in] t The tensor object + * @param[out] out_indexes The per-dimension indexes + * @param[in] out_indexes_len The maximum size of the out_indexes array + * @returns void + */ +void delinearize_index( + size_t linear_index, + const Tensor& t, + size_t* out_indexes, + const size_t out_indexes_len); +} // namespace torch::executor diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index 1c8edb9d3c7..e1c5cadfe84 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -66,9 +66,13 @@ def define_common_targets(): runtime.cxx_library( name = "broadcast_util", - srcs = ["broadcast_util.cpp"], + srcs = [ + "broadcast_util.cpp", + "delinearize_index.cpp", + ], exported_headers = [ "broadcast_util.h", + "delinearize_index.h", ], exported_deps = [ ":broadcast_indexes_range", diff --git a/runtime/kernel/targets.bzl b/runtime/kernel/targets.bzl index d49435f2825..5c95f10276d 100644 --- a/runtime/kernel/targets.bzl +++ b/runtime/kernel/targets.bzl @@ -51,6 +51,20 @@ def define_common_targets(): preprocessor_flags = ["-DMAX_KERNEL_NUM=1"], ) + runtime.cxx_library( + name = "thread_parallel_interface", + exported_headers = ["thread_parallel_interface.h"], + exported_deps = [ + "//executorch/runtime/core:core", + "//executorch/runtime/core/portable_type/c10/c10:c10", + "//executorch/runtime/platform:platform", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + ) + for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" diff --git a/runtime/kernel/thread_parallel_interface.h b/runtime/kernel/thread_parallel_interface.h new file mode 100644 index 00000000000..52100475c7b --- /dev/null +++ b/runtime/kernel/thread_parallel_interface.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include +#include + +namespace executorch { +namespace extension { +namespace internal { +template +inline bool parallel_for_no_threadpool( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const Func& f) { + ET_CHECK_OR_RETURN_FALSE( + begin >= 0 && end >= 0 && end >= begin, + "begin = %" PRId64 ", end = %" PRId64, + begin, + end); + ET_CHECK_OR_RETURN_FALSE(grain_size > 0, "grain_size = %" PRId64, grain_size); +#ifndef NDEBUG + // Go backwards through the range elementwise to catch code that + // assumes parallel_for is in order like a regular for loop. + for (const auto i : c10::irange(begin, end)) { + const auto offset = i - begin; + const auto idx = end - offset - 1; + f(idx, idx + 1); + } +#else // NDEBUG + f(begin, end); +#endif + return true; +} + +} // namespace internal + +#ifdef ET_USE_THREADPOOL +/** + * A helper to run a function in parallel. + * + * begin, end: describe the extent of the workitems via first and last workitem + * to be processed + * grain_size: number of workitems processed by user callback which is + * described below + * f: user function applied in parallel to the chunks, signature: + * void f(int64_t begin, int64_t end) + * Returns true if all work items are processed successfully, false otherwise + * + * Warning: parallel_for does NOT copy thread local states from the current + * thread to the worker threads. 
Users need to protect the access to captured + * data if they mutate them in f. + */ +bool parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const std::function& f); + +int64_t get_thread_num(); + +void set_thread_num(int64_t thread_num); +#else // ET_USE_THREADPOOL +template +bool parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const Func& func) { + return internal::parallel_for_no_threadpool(begin, end, grain_size, func); +} + +inline int64_t get_thread_num() { + return 0; +} + +inline void set_thread_num(int64_t thread_num) { + ET_DCHECK_MSG(false, "cannot set_thread_num without threading support!"); +} +#endif // ET_USE_THREADPOOL +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::get_thread_num; +using ::executorch::extension::parallel_for; +using ::executorch::extension::set_thread_num; +} // namespace executor +} // namespace torch diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json index cc5e625f1e8..be594f9d5f4 100644 --- a/test/utils/OSSTestConfig.json +++ b/test/utils/OSSTestConfig.json @@ -59,6 +59,16 @@ "extension_tensor" ] }, + { + "directory": "extension/threadpool/test", + "sources": [ + "thread_parallel_test.cpp", + "threadpool_test.cpp" + ], + "additional_libs": [ + "extension_threadpool" + ] + }, { "directory": "kernels/portable/cpu/util/test", "sources": [
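When ET_USE_THREADPOOL is not defined, parallel_for resolves to the header-only fallback above, and in debug builds parallel_for_no_threadpool walks the range backwards one element at a time specifically to surface callers that assume chunks arrive in ascending order. A sketch of the difference (hypothetical example, not part of the patch):

#include <executorch/runtime/kernel/thread_parallel_interface.h>

#include <cstdint>
#include <vector>

void example(std::vector<int64_t>& out) {
  // Order-independent: each invocation writes only its own [begin, end) slots,
  // so it behaves the same under the threadpool, the release-mode fallback
  // (one call covering the whole range), and the debug-mode backwards walk.
  ::executorch::extension::parallel_for(
      0, static_cast<int64_t>(out.size()), /*grain_size=*/1,
      [&](int64_t begin, int64_t end) {
        for (int64_t i = begin; i < end; ++i) {
          out[i] = i * i;
        }
      });

  // Order-dependent (a bug): appending assumes chunks arrive front to back.
  // The debug-mode backwards walk produces a reversed vector, exposing the
  // hidden assumption; with a threadpool this is also a data race.
  std::vector<int64_t> appended;
  ::executorch::extension::parallel_for(
      0, static_cast<int64_t>(out.size()), /*grain_size=*/1,
      [&](int64_t begin, int64_t end) {
        for (int64_t i = begin; i < end; ++i) {
          appended.push_back(i);
        }
      });
}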