From d0b11e8bb47dacb6e200ac8b0e2609626ccee0f3 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 11:35:29 -0800 Subject: [PATCH 01/38] Update [ghstack-poisoned] --- CMakeLists.txt | 2 +- build/cmake_deps.toml | 18 ++++++++++++++++++ kernels/optimized/CMakeLists.txt | 2 +- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index de941663a88..73b89b6171e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -749,9 +749,9 @@ endif() if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO - AND CMAKE_CXX_STANDARD GREATER_EQUAL 14 ) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/parallel) endif() if(EXECUTORCH_BUILD_PYBIND) diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index 4b22a09cb5b..4bbfd636a96 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -73,6 +73,7 @@ excludes = [ deps = [ "executorch", "executorch_core", + "extension_parallel", "extension_threadpool", "portable_kernels", ] @@ -115,6 +116,7 @@ excludes = [ deps = [ "executorch_core", "executorch", + "extension_parallel", ] [targets.optimized_native_cpu_ops] @@ -129,6 +131,8 @@ excludes = [ deps = [ "executorch_core", "executorch", + "extension_parallel", + "extension_threadpool", "portable_kernels", ] # ---------------------------------- core end ---------------------------------- @@ -208,6 +212,19 @@ deps = [ "extension_runner_util", ] +[targets.extension_parallel] +buck_targets = [ + "//extension/parallel:thread_parallel", +] +filters = [ + ".cpp$", +] +deps = [ + "executorch", + "executorch_core", + "extension_threadpool", +] + [targets.extension_tensor] buck_targets = [ "//extension/tensor:tensor", @@ -395,6 +412,7 @@ deps = [ "executorch", "executorch_core", "optimized_kernels", + "extension_parallel", "extension_threadpool", "xnnpack_backend", ] diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index 
235c6738d9a..d9b19d4f9c2 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -43,7 +43,7 @@ endif() list(TRANSFORM _optimized_cpublas__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(cpublas STATIC ${_optimized_cpublas__srcs}) target_link_libraries( - cpublas PRIVATE executorch_core eigen_blas extension_threadpool + cpublas PRIVATE executorch_core eigen_blas extension_parallel extension_threadpool ) target_compile_options(cpublas PUBLIC ${_common_compile_options}) From 9437be1e7055d5705540d3544955b5d30f72be43 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 12:29:58 -0800 Subject: [PATCH 02/38] Update [ghstack-poisoned] --- extension/parallel/CMakeLists.txt | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 extension/parallel/CMakeLists.txt diff --git a/extension/parallel/CMakeLists.txt b/extension/parallel/CMakeLists.txt new file mode 100644 index 00000000000..7f727aafe46 --- /dev/null +++ b/extension/parallel/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# Please keep this file formatted by running: +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ + +if(NOT (EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO)) + message(FATAL_ERROR "extension/parallel requires extension/threadpool") +endif() + +add_library(extension_parallel thread_parallel.cpp) + +target_link_libraries(extension_parallel PUBLIC executorch_core extension_threadpool) +target_compile_options(extension_parallel PUBLIC ${_common_compile_options}) + +install( + TARGETS extension_parallel + DESTINATION lib + INCLUDES + DESTINATION ${_common_include_directories}) From 643e10ee081b1ea34f0f5fb49f7df44a9f2f666b Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 14:54:47 -0800 Subject: [PATCH 03/38] Update [ghstack-poisoned] --- build/executorch-config.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index d14a1227cd9..539c7c2960e 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -67,6 +67,7 @@ set(lib_list portable_ops_lib extension_module extension_module_static + extension_parallel extension_runner_util extension_tensor extension_threadpool @@ -114,3 +115,7 @@ foreach(lib ${lib_list}) list(APPEND EXECUTORCH_LIBRARIES ${lib}) endif() endforeach() + +# TODO: investigate use of install(EXPORT) to cleanly handle +# target_compile_options/target_compile_definitions for everything. 
+target_link_libraries(cpublas INTERFACE extension_parallel) From 6f2842b876a5a2310c6bf7311d6f6b76bc54549e Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 14:54:48 -0800 Subject: [PATCH 04/38] Update [ghstack-poisoned] --- build/executorch-config.cmake | 1 + examples/models/llama/CMakeLists.txt | 9 --------- examples/models/llava/CMakeLists.txt | 9 --------- 3 files changed, 1 insertion(+), 18 deletions(-) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 539c7c2960e..d238db8ca95 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -65,6 +65,7 @@ set(lib_list neuron_backend qnn_executorch_backend portable_ops_lib + custom_ops extension_module extension_module_static extension_parallel diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index 5f49581ea25..f5d5a78d430 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -84,14 +84,6 @@ if(CMAKE_TOOLCHAIN_IOS OR ANDROID) target_link_options_shared_lib(executorch) endif() -# custom ops library -if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/custom_ops - ${CMAKE_CURRENT_BINARY_DIR}/../../../extension/llm/custom_ops - ) -endif() - # llama_runner library add_subdirectory(runner) @@ -119,7 +111,6 @@ target_link_options_shared_lib(quantized_ops_lib) list(APPEND link_libraries quantized_kernels quantized_ops_lib) if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - target_link_options_shared_lib(custom_ops) list(APPEND link_libraries custom_ops) endif() diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index ecd00809fdb..f7fa4bacc04 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -93,14 +93,6 @@ if(CMAKE_TOOLCHAIN_IOS OR ANDROID) target_link_options_shared_lib(executorch) endif() -# custom ops library -if(EXECUTORCH_BUILD_KERNELS_CUSTOM) 
- add_subdirectory( - ${EXECUTORCH_ROOT}/extension/llm/custom_ops - ${CMAKE_CURRENT_BINARY_DIR}/../../../extension/llm/custom_ops - ) -endif() - # llava_runner library add_subdirectory(runner) @@ -132,7 +124,6 @@ target_link_options_shared_lib(quantized_ops_lib) list(APPEND link_libraries quantized_kernels quantized_ops_lib) if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - target_link_options_shared_lib(custom_ops) list(APPEND link_libraries custom_ops) endif() From e47dfeb68a2e8e2ae0e3fa2add553f724f73404d Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 14:54:52 -0800 Subject: [PATCH 05/38] Update [ghstack-poisoned] --- build/executorch-config.cmake | 1 + examples/models/llama/CMakeLists.txt | 1 - examples/models/llava/CMakeLists.txt | 1 - examples/models/llava/targets.bzl | 3 --- extension/android/CMakeLists.txt | 1 - extension/llm/custom_ops/CMakeLists.txt | 4 ++-- extension/threadpool/CMakeLists.txt | 1 + 7 files changed, 4 insertions(+), 8 deletions(-) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index d238db8ca95..75e1075d929 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -120,3 +120,4 @@ endforeach() # TODO: investigate use of install(EXPORT) to cleanly handle # target_compile_options/target_compile_definitions for everything. 
target_link_libraries(cpublas INTERFACE extension_parallel) +target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL) diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index f5d5a78d430..b3364be610a 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -131,7 +131,6 @@ endif() set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack) # Extra compile option and include dir for pthreadpool if(EXECUTORCH_BUILD_PTHREADPOOL) - list(APPEND _common_compile_options -DET_USE_THREADPOOL) list(APPEND link_libraries extension_threadpool pthreadpool) list(APPEND _common_include_directories ${XNNPACK_ROOT}/third-party/pthreadpool/include diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index f7fa4bacc04..5d5857dd5af 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -130,7 +130,6 @@ endif() set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack) # Extra compile option and include dir for pthreadpool if(EXECUTORCH_BUILD_PTHREADPOOL) - list(APPEND _common_compile_options -DET_USE_THREADPOOL) list(APPEND link_libraries extension_threadpool pthreadpool) list(APPEND _common_include_directories ${XNNPACK_ROOT}/third-party/pthreadpool/include diff --git a/examples/models/llava/targets.bzl b/examples/models/llava/targets.bzl index 5efb099f06f..6f3a370acf4 100644 --- a/examples/models/llava/targets.bzl +++ b/examples/models/llava/targets.bzl @@ -7,9 +7,6 @@ def define_common_targets(): "main.cpp", ], compiler_flags = ["-Wno-global-constructors"], - preprocessor_flags = [ - "-DET_USE_THREADPOOL", - ], deps = [ "//executorch/examples/models/llava/runner:runner", "//executorch/extension/evalue_util:print_evalue", diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 70f21f2751c..849d1d14364 100644 --- a/extension/android/CMakeLists.txt +++ 
b/extension/android/CMakeLists.txt @@ -124,7 +124,6 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM) endif() if(TARGET pthreadpool) - target_compile_definitions(executorch_jni PRIVATE ET_USE_THREADPOOL=1) target_include_directories( executorch_jni PUBLIC diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index c3969e6f9bf..eeb118d4344 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -78,7 +78,7 @@ target_include_directories( target_link_libraries(custom_ops PUBLIC ${custom_ops_libs} executorch_core) target_compile_options( - custom_ops PUBLIC ${_common_compile_options} -DET_USE_THREADPOOL + custom_ops PUBLIC ${_common_compile_options} ) install(TARGETS custom_ops DESTINATION lib) @@ -130,7 +130,7 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) target_compile_options( custom_ops_aot_lib PUBLIC -Wno-deprecated-declarations -fPIC -frtti -fexceptions - ${_common_compile_options} -DET_USE_THREADPOOL + ${_common_compile_options} ) install(TARGETS custom_ops_aot_lib diff --git a/extension/threadpool/CMakeLists.txt b/extension/threadpool/CMakeLists.txt index 90288656674..c1d86acf75d 100644 --- a/extension/threadpool/CMakeLists.txt +++ b/extension/threadpool/CMakeLists.txt @@ -32,6 +32,7 @@ target_include_directories( PUBLIC ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include ) +target_compile_definitions(extension_threadpool PUBLIC ET_USE_THREADPOOL) target_compile_options(extension_threadpool PUBLIC ${_common_compile_options}) # Install libraries From a92958a1f6fbfbc154ec24c6b4ee6c6ebd41aea8 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 18:41:25 -0800 Subject: [PATCH 06/38] Update [ghstack-poisoned] --- build/executorch-config.cmake | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 
1a2da26416e..8f64c502b47 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -118,6 +118,8 @@ endforeach() # TODO: investigate use of install(EXPORT) to cleanly handle # target_compile_options/target_compile_definitions for everything. -set_target_properties( - cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_parallel -) +if (TARGET cpublas) + set_target_properties( + cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_parallel + ) +endif() From 3bd64370f6454f9523b6cc51d05d377fed8a77fb Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 19:54:10 -0800 Subject: [PATCH 07/38] Update [ghstack-poisoned] --- build/executorch-config.cmake | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 8f64c502b47..2c459b66ac8 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -118,7 +118,12 @@ endforeach() # TODO: investigate use of install(EXPORT) to cleanly handle # target_compile_options/target_compile_definitions for everything. 
-if (TARGET cpublas) +if(TARGET extension_parallel) + set_target_properties( + extension_parallel PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool + ) +endif() +if(TARGET cpublas) set_target_properties( cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_parallel ) From 9fdebee5a3e9a439895df57b49343816fcceee86 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 4 Mar 2025 21:11:25 -0800 Subject: [PATCH 08/38] Update [ghstack-poisoned] --- build/cmake_deps.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index 4bbfd636a96..4811563269c 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -449,6 +449,7 @@ deps = [ "executorch_core", "extension_data_loader", "extension_module", + "extension_parallel", "portable_kernels", "quantized_kernels", "xnnpack_backend", From e48e81617b32f3460c5b449b8a524e221f79bef1 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 5 Mar 2025 14:40:02 -0800 Subject: [PATCH 09/38] Update [ghstack-poisoned] --- extension/parallel/targets.bzl | 38 ++++++++++++-------------- extension/parallel/thread_parallel.cpp | 12 +++++--- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/extension/parallel/targets.bzl b/extension/parallel/targets.bzl index 82b3deab129..82a8502c034 100644 --- a/extension/parallel/targets.bzl +++ b/extension/parallel/targets.bzl @@ -7,24 +7,20 @@ def define_common_targets(): TARGETS and BUCK files that call this function. 
""" - for aten_mode in get_aten_mode_options(): - aten_suffix = ("_aten" if aten_mode else "") - - runtime.cxx_library( - name = "thread_parallel" + aten_suffix, - srcs = [ - "thread_parallel.cpp", - ], - exported_headers = [ - "thread_parallel.h", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//executorch/extension/threadpool:threadpool", - "//executorch/runtime/core:core", - "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, - ], - ) + runtime.cxx_library( + name = "thread_parallel", + srcs = [ + "thread_parallel.cpp", + ], + exported_headers = [ + "thread_parallel.h", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "//executorch/extension/threadpool:threadpool", + "//executorch/runtime/core:core", + ], + ) diff --git a/extension/parallel/thread_parallel.cpp b/extension/parallel/thread_parallel.cpp index dfbb911d3a9..5d481ccd44c 100644 --- a/extension/parallel/thread_parallel.cpp +++ b/extension/parallel/thread_parallel.cpp @@ -6,11 +6,12 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include #include #include -#include +#include #include namespace executorch { @@ -53,9 +54,12 @@ bool parallel_for( const int64_t end, const int64_t grain_size, const std::function& f) { - ET_LOG_AND_RETURN_IF_FALSE(begin >= 0 && end >= 0); - ET_LOG_AND_RETURN_IF_FALSE(end >= begin); - ET_LOG_AND_RETURN_IF_FALSE(grain_size > 0); + ET_CHECK_OR_RETURN_FALSE( + begin >= 0 && end >= 0 && end >= begin, + "begin = %" PRId64 ", end = %" PRId64, + begin, + end); + ET_CHECK_OR_RETURN_FALSE(grain_size > 0, "grain_size = %" PRId64, grain_size); int64_t num_tasks = 0, chunk_size = 0; std::tie(num_tasks, chunk_size) = calc_num_tasks_and_chunk_size(begin, end, grain_size); From 3351d50555714d72d46d0ff4f096eff4ab4e61c4 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 5 Mar 2025 16:38:46 -0800 Subject: [PATCH 10/38] Update [ghstack-poisoned] --- .lintrunner.toml | 2 + CMakeLists.txt | 1 - Test.cmake | 1 - build/cmake_deps.toml | 19 +--- build/executorch-config.cmake | 8 +- extension/llm/custom_ops/op_sdpa.cpp | 2 +- extension/llm/custom_ops/targets.bzl | 1 - extension/parallel/CMakeLists.txt | 25 ----- extension/parallel/TARGETS | 8 -- extension/parallel/targets.bzl | 26 ------ extension/parallel/test/TARGETS | 8 -- extension/parallel/test/targets.bzl | 19 ---- extension/parallel/thread_parallel.h | 49 ++-------- extension/threadpool/CMakeLists.txt | 7 +- extension/threadpool/targets.bzl | 3 + .../test/CMakeLists.txt | 20 +--- extension/threadpool/test/targets.bzl | 12 +++ .../test/thread_parallel_test.cpp | 41 ++++++--- .../thread_parallel.cpp | 2 +- kernels/optimized/CMakeLists.txt | 2 +- kernels/optimized/blas/BlasKernel.h | 2 +- kernels/optimized/lib_defs.bzl | 6 +- runtime/kernel/targets.bzl | 13 +++ runtime/kernel/thread_parallel_interface.h | 92 +++++++++++++++++++ test/utils/OSSTestConfig.json | 10 ++ 25 files changed, 185 insertions(+), 194 deletions(-) delete mode 100644 extension/parallel/CMakeLists.txt delete mode 100644 extension/parallel/TARGETS 
delete mode 100644 extension/parallel/targets.bzl delete mode 100644 extension/parallel/test/TARGETS delete mode 100644 extension/parallel/test/targets.bzl rename extension/{parallel => threadpool}/test/CMakeLists.txt (53%) rename extension/{parallel => threadpool}/test/thread_parallel_test.cpp (77%) rename extension/{parallel => threadpool}/thread_parallel.cpp (97%) create mode 100644 runtime/kernel/thread_parallel_interface.h diff --git a/.lintrunner.toml b/.lintrunner.toml index 7667ac430d1..1a27228d266 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -218,6 +218,8 @@ exclude_patterns = [ 'examples/**', 'extension/**', 'kernels/optimized/**', + # Justified include. + 'runtime/kernel/thread_parallel_interface.h', 'scripts/**', 'third-party/**', 'util/**', diff --git a/CMakeLists.txt b/CMakeLists.txt index 73b89b6171e..fabf667cbe1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -751,7 +751,6 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO ) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/parallel) endif() if(EXECUTORCH_BUILD_PYBIND) diff --git a/Test.cmake b/Test.cmake index d4b5f6aa1db..6bd7a86e70b 100644 --- a/Test.cmake +++ b/Test.cmake @@ -13,7 +13,6 @@ if(BUILD_TESTING) add_subdirectory(extension/evalue_util/test) add_subdirectory(extension/kernel_util/test) add_subdirectory(extension/memory_allocator/test) - add_subdirectory(extension/parallel/test) add_subdirectory(extension/pytree/test) add_subdirectory(kernels/portable/cpu/util/test) add_subdirectory(kernels/prim_ops/test) diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index b1ed81b6a7e..9937f1b882f 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -88,7 +88,6 @@ excludes = [ deps = [ "executorch", "executorch_core", - "extension_parallel", "extension_threadpool", "portable_kernels", ] @@ -131,7 +130,6 @@ excludes = [ deps = [ "executorch_core", "executorch", - 
"extension_parallel", ] [targets.optimized_native_cpu_ops] @@ -146,7 +144,6 @@ excludes = [ deps = [ "executorch_core", "executorch", - "extension_parallel", "extension_threadpool", "portable_kernels", ] @@ -227,19 +224,6 @@ deps = [ "extension_runner_util", ] -[targets.extension_parallel] -buck_targets = [ - "//extension/parallel:thread_parallel", -] -filters = [ - ".cpp$", -] -deps = [ - "executorch", - "executorch_core", - "extension_threadpool", -] - [targets.extension_tensor] buck_targets = [ "//extension/tensor:tensor", @@ -393,6 +377,7 @@ filters = [ deps = [ "executorch", "executorch_core", + "extension_threadpool", ] [targets.xnnpack_schema] @@ -427,7 +412,6 @@ deps = [ "executorch", "executorch_core", "optimized_kernels", - "extension_parallel", "extension_threadpool", "reduce_util", "xnnpack_backend", @@ -465,7 +449,6 @@ deps = [ "executorch_core", "extension_data_loader", "extension_module", - "extension_parallel", "portable_kernels", "quantized_kernels", "xnnpack_backend", diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 35fe03467f2..2e8cb95b70f 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -68,7 +68,6 @@ set(lib_list custom_ops extension_module extension_module_static - extension_parallel extension_runner_util extension_tensor extension_threadpool @@ -119,14 +118,9 @@ endforeach() # TODO: investigate use of install(EXPORT) to cleanly handle # target_compile_options/target_compile_definitions for everything. 
-if(TARGET extension_parallel) - set_target_properties( - extension_parallel PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool - ) -endif() if(TARGET cpublas) set_target_properties( - cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_parallel + cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool ) endif() target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL) diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index f0a7775e803..db7cb42f6d0 100644 --- a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -19,7 +19,7 @@ #include #ifdef ET_USE_THREADPOOL -#include +#include #include #endif #include diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl index e3e8b30520f..1c4686fe3d0 100644 --- a/extension/llm/custom_ops/targets.bzl +++ b/extension/llm/custom_ops/targets.bzl @@ -37,7 +37,6 @@ def define_common_targets(): "//executorch/kernels/optimized:libblas{}".format(mkl_dep), "//executorch/kernels/optimized:libvec", "//executorch/extension/kernel_util:kernel_util", - "//executorch/extension/parallel:thread_parallel", "//executorch/extension/threadpool:threadpool", ], deps = [ diff --git a/extension/parallel/CMakeLists.txt b/extension/parallel/CMakeLists.txt deleted file mode 100644 index 7f727aafe46..00000000000 --- a/extension/parallel/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# Please keep this file formatted by running: -# ~~~ -# cmake-format -i CMakeLists.txt -# ~~~ - -if(NOT (EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO)) - message(FATAL_ERROR "extension/parallel requires extension/threadpool") -endif() - -add_library(extension_parallel thread_parallel.cpp) - -target_link_libraries(extension_parallel PUBLIC executorch_core extension_threadpool) -target_compile_options(extension_parallel PUBLIC ${_common_compile_options}) - -install( - TARGETS extension_parallel - DESTINATION lib - INCLUDES - DESTINATION ${_common_include_directories}) diff --git a/extension/parallel/TARGETS b/extension/parallel/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/parallel/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/parallel/targets.bzl b/extension/parallel/targets.bzl deleted file mode 100644 index 82a8502c034..00000000000 --- a/extension/parallel/targets.bzl +++ /dev/null @@ -1,26 +0,0 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") - -def define_common_targets(): - """Defines targets that should be shared between fbcode and xplat. - - The directory containing this targets.bzl file should also contain both - TARGETS and BUCK files that call this function. 
- """ - - runtime.cxx_library( - name = "thread_parallel", - srcs = [ - "thread_parallel.cpp", - ], - exported_headers = [ - "thread_parallel.h", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//executorch/extension/threadpool:threadpool", - "//executorch/runtime/core:core", - ], - ) diff --git a/extension/parallel/test/TARGETS b/extension/parallel/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/parallel/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/parallel/test/targets.bzl b/extension/parallel/test/targets.bzl deleted file mode 100644 index 791c0727471..00000000000 --- a/extension/parallel/test/targets.bzl +++ /dev/null @@ -1,19 +0,0 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -def define_common_targets(): - """Defines targets that should be shared between fbcode and xplat. - - The directory containing this targets.bzl file should also contain both - TARGETS and BUCK files that call this function. - """ - - runtime.cxx_test( - name = "thread_parallel_test", - srcs = [ - "thread_parallel_test.cpp", - ], - deps = [ - "//executorch/extension/parallel:thread_parallel", - "//executorch/runtime/platform:platform", - ], - ) diff --git a/extension/parallel/thread_parallel.h b/extension/parallel/thread_parallel.h index 8b174075ae9..5f4edeb333c 100644 --- a/extension/parallel/thread_parallel.h +++ b/extension/parallel/thread_parallel.h @@ -8,46 +8,9 @@ #pragma once -#include -#include - -namespace executorch { -namespace extension { - -/** - * A helper to run function in parallel. 
- * - * begin, end: describe the extent of the workitems via first and last workitem - * to be processed - * grain_size: number of workitems processed by user callback which is - * described below - * f: user function applied in parallel to the chunks, signature: - * void f(int64_t begin, int64_t end) - * Returns true if all work items are processed successfully, false otherwise - * - * Warning: parallel_for does NOT copy thread local states from the current - * thread to the worker threads. Users need to protect the access to captured - * data if they mutate them in f. - */ -bool parallel_for( - const int64_t begin, - const int64_t end, - const int64_t grain_size, - const std::function& f); - -int64_t get_thread_num(); - -void set_thread_num(int64_t thread_num); - -} // namespace extension -} // namespace executorch - -namespace torch { -namespace executor { -// TODO(T197294990): Remove these deprecated aliases once all users have moved -// to the new `::executorch` namespaces. -using ::executorch::extension::get_thread_num; -using ::executorch::extension::parallel_for; -using ::executorch::extension::set_thread_num; -} // namespace executor -} // namespace torch +// This header is a stub left behind after the move to +// executorch/runtime/kernel. Depend on this target and include this +// header if you have a hard requirement for threading; if you want to +// cleanly use parallelization if available, then depend on and use +// the below header instead. 
+#include diff --git a/extension/threadpool/CMakeLists.txt b/extension/threadpool/CMakeLists.txt index c1d86acf75d..6e107cb6634 100644 --- a/extension/threadpool/CMakeLists.txt +++ b/extension/threadpool/CMakeLists.txt @@ -21,7 +21,8 @@ if(NOT CMAKE_CXX_STANDARD) endif() add_library( - extension_threadpool threadpool.cpp threadpool_guard.cpp cpuinfo_utils.cpp + extension_threadpool threadpool.cpp threadpool_guard.cpp thread_parallel.cpp + cpuinfo_utils.cpp ) target_link_libraries( extension_threadpool PUBLIC executorch_core cpuinfo pthreadpool @@ -42,3 +43,7 @@ install( INCLUDES DESTINATION ${_common_include_directories} ) + +if(BUILD_TESTING) + add_subdirectory(test) +endif() diff --git a/extension/threadpool/targets.bzl b/extension/threadpool/targets.bzl index 8bb0398b385..1c34dbbc7d4 100644 --- a/extension/threadpool/targets.bzl +++ b/extension/threadpool/targets.bzl @@ -9,6 +9,7 @@ def define_common_targets(): """ _THREADPOOL_SRCS = [ + "thread_parallel.cpp", "threadpool.cpp", "threadpool_guard.cpp", ] + (["fb/threadpool_use_n_threads.cpp"] if not runtime.is_oss else []) @@ -29,6 +30,8 @@ def define_common_targets(): exported_deps = [ third_party_dep("pthreadpool"), third_party_dep("cpuinfo"), + # Allow users to use the header without an extra deps entry. + "//executorch/runtime/kernel:thread_parallel_interface", ], exported_preprocessor_flags = [ "-DET_USE_THREADPOOL", diff --git a/extension/parallel/test/CMakeLists.txt b/extension/threadpool/test/CMakeLists.txt similarity index 53% rename from extension/parallel/test/CMakeLists.txt rename to extension/threadpool/test/CMakeLists.txt index ab37f66c17d..3f9b13f2ab4 100644 --- a/extension/parallel/test/CMakeLists.txt +++ b/extension/threadpool/test/CMakeLists.txt @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# @generated by test/utils/generate_gtest_cmakelists.py +# # This file should be formatted with # ~~~ # cmake-format -i CMakeLists.txt @@ -12,28 +14,14 @@ # cmake_minimum_required(VERSION 3.19) -project(extension_parallel_test) - -# Use C++17 for test. -set(CMAKE_CXX_STANDARD 17) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) include(${EXECUTORCH_ROOT}/build/Test.cmake) -set(_test_srcs thread_parallel_test.cpp ../thread_parallel.cpp) +set(_test_srcs thread_parallel_test.cpp threadpool_test.cpp) et_cxx_test( - extension_parallel_test - SOURCES - ${_test_srcs} - EXTRA_LIBS - pthreadpool - cpuinfo + extension_threadpool_test SOURCES ${_test_srcs} EXTRA_LIBS extension_threadpool ) -target_include_directories( - extension_parallel_test - PRIVATE ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include - ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include -) diff --git a/extension/threadpool/test/targets.bzl b/extension/threadpool/test/targets.bzl index b8a39d8969a..8bdf776c825 100644 --- a/extension/threadpool/test/targets.bzl +++ b/extension/threadpool/test/targets.bzl @@ -18,3 +18,15 @@ def define_common_targets(): "//executorch/extension/threadpool:threadpool", ], ) + + runtime.cxx_test( + name = "thread_parallel_test", + srcs = [ + "thread_parallel_test.cpp", + ], + deps = [ + "//executorch/extension/threadpool:threadpool", + "//executorch/runtime/kernel:thread_parallel_interface", + "//executorch/runtime/platform:platform", + ], + ) diff --git a/extension/parallel/test/thread_parallel_test.cpp b/extension/threadpool/test/thread_parallel_test.cpp similarity index 77% rename from extension/parallel/test/thread_parallel_test.cpp rename to extension/threadpool/test/thread_parallel_test.cpp index d386429100d..63581be29e8 100644 --- a/extension/parallel/test/thread_parallel_test.cpp +++ b/extension/threadpool/test/thread_parallel_test.cpp @@ -11,13 +11,16 @@ #include #include -#include +#include #include using namespace ::testing; using 
::executorch::extension::parallel_for; -class ParallelTest : public ::testing::Test { +#ifndef ET_USE_THREADPOOL +#endif + +class ParallelTest : public ::testing::TestWithParam { protected: void SetUp() override { data_.fill(0); @@ -42,12 +45,20 @@ class ParallelTest : public ::testing::Test { } } + template + bool parallel_for(const int64_t begin, const int64_t end, const int64_t grain_size, const Func& func) { + if (GetParam()) { + return executorch::extension::parallel_for(begin, end, grain_size, func); + } + return executorch::extension::internal::parallel_for_no_threadpool(begin, end, grain_size, func); + } + std::array data_; std::mutex mutex_; int sum_of_all_elements_; }; -TEST_F(ParallelTest, TestAllInvoked) { +TEST_P(ParallelTest, TestAllInvoked) { EXPECT_TRUE(parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -57,7 +68,7 @@ TEST_F(ParallelTest, TestAllInvoked) { } } -TEST_F(ParallelTest, TestAllInvokedWithMutex) { +TEST_P(ParallelTest, TestAllInvokedWithMutex) { EXPECT_TRUE(parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); })); @@ -70,7 +81,7 @@ TEST_F(ParallelTest, TestAllInvokedWithMutex) { EXPECT_EQ(sum_of_all_elements_, expected_sum); } -TEST_F(ParallelTest, TestInvalidRange) { +TEST_P(ParallelTest, TestInvalidRange) { et_pal_init(); EXPECT_FALSE(parallel_for(10, 0, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); @@ -82,7 +93,7 @@ TEST_F(ParallelTest, TestInvalidRange) { EXPECT_EQ(sum_of_all_elements_, 0); } -TEST_F(ParallelTest, TestInvalidRange2) { +TEST_P(ParallelTest, TestInvalidRange2) { et_pal_init(); EXPECT_FALSE(parallel_for(6, 5, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); @@ -94,7 +105,7 @@ TEST_F(ParallelTest, TestInvalidRange2) { EXPECT_EQ(sum_of_all_elements_, 0); } -TEST_F(ParallelTest, TestInvokePartialFromBeginning) { +TEST_P(ParallelTest, TestInvokePartialFromBeginning) { 
EXPECT_TRUE(parallel_for(0, 5, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -107,7 +118,7 @@ TEST_F(ParallelTest, TestInvokePartialFromBeginning) { } } -TEST_F(ParallelTest, TestInvokePartialToEnd) { +TEST_P(ParallelTest, TestInvokePartialToEnd) { EXPECT_TRUE(parallel_for(5, 10, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -120,7 +131,7 @@ TEST_F(ParallelTest, TestInvokePartialToEnd) { } } -TEST_F(ParallelTest, TestInvokePartialMiddle) { +TEST_P(ParallelTest, TestInvokePartialMiddle) { EXPECT_TRUE(parallel_for(2, 8, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -136,7 +147,7 @@ TEST_F(ParallelTest, TestInvokePartialMiddle) { } } -TEST_F(ParallelTest, TestChunkSize2) { +TEST_P(ParallelTest, TestChunkSize2) { EXPECT_TRUE(parallel_for(0, 10, 2, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -146,7 +157,7 @@ TEST_F(ParallelTest, TestChunkSize2) { } } -TEST_F(ParallelTest, TestChunkSize2Middle) { +TEST_P(ParallelTest, TestChunkSize2Middle) { EXPECT_TRUE(parallel_for(3, 8, 2, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -162,7 +173,7 @@ TEST_F(ParallelTest, TestChunkSize2Middle) { } } -TEST_F(ParallelTest, TestChunkSize3) { +TEST_P(ParallelTest, TestChunkSize3) { EXPECT_TRUE(parallel_for(0, 10, 3, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -172,7 +183,7 @@ TEST_F(ParallelTest, TestChunkSize3) { } } -TEST_F(ParallelTest, TestChunkSize6) { +TEST_P(ParallelTest, TestChunkSize6) { EXPECT_TRUE(parallel_for(0, 10, 6, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -182,7 +193,7 @@ TEST_F(ParallelTest, TestChunkSize6) { } } -TEST_F(ParallelTest, TestChunkSizeTooLarge) { +TEST_P(ParallelTest, TestChunkSizeTooLarge) { EXPECT_TRUE(parallel_for(0, 10, 11, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -191,3 +202,5 @@ TEST_F(ParallelTest, TestChunkSizeTooLarge) { 
EXPECT_EQ(data_[i], i); } } + +INSTANTIATE_TEST_SUITE_P(ParallelTestWithOrWithoutThreadpool, ParallelTest, ::testing::Values(true, false)); diff --git a/extension/parallel/thread_parallel.cpp b/extension/threadpool/thread_parallel.cpp similarity index 97% rename from extension/parallel/thread_parallel.cpp rename to extension/threadpool/thread_parallel.cpp index 5d481ccd44c..fa26742368f 100644 --- a/extension/parallel/thread_parallel.cpp +++ b/extension/threadpool/thread_parallel.cpp @@ -9,9 +9,9 @@ #include #include -#include #include #include +#include #include namespace executorch { diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index c6d31c20263..23e26bfa72b 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -43,7 +43,7 @@ endif() list(TRANSFORM _optimized_cpublas__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(cpublas STATIC ${_optimized_cpublas__srcs}) target_link_libraries( - cpublas PUBLIC executorch_core eigen_blas extension_parallel extension_threadpool + cpublas PUBLIC executorch_core eigen_blas extension_threadpool ) target_compile_options(cpublas PUBLIC ${_common_compile_options}) diff --git a/kernels/optimized/blas/BlasKernel.h b/kernels/optimized/blas/BlasKernel.h index c2b03cfebdd..fc47b4482d6 100644 --- a/kernels/optimized/blas/BlasKernel.h +++ b/kernels/optimized/blas/BlasKernel.h @@ -11,8 +11,8 @@ #include #include -#include #include +#include #include diff --git a/kernels/optimized/lib_defs.bzl b/kernels/optimized/lib_defs.bzl index 659c7afe090..dd246f38984 100644 --- a/kernels/optimized/lib_defs.bzl +++ b/kernels/optimized/lib_defs.bzl @@ -186,7 +186,10 @@ def define_libs(is_fbcode=False): ], ) - LIBBLAS_DEPS = [third_party_dep("cpuinfo")] + LIBBLAS_DEPS = [ + third_party_dep("cpuinfo"), + "//executorch/extension/threadpool:threadpool", + ] for libblas_name, mkl_dep in [("libblas", "fbsource//third-party/mkl:mkl_lp64_omp"), ("libblas_mkl_noomp", 
"fbsource//third-party/mkl:mkl")]: runtime.cxx_library( @@ -229,7 +232,6 @@ def define_libs(is_fbcode=False): "DEFAULT": [], }) + LIBBLAS_DEPS, exported_deps = [ - "//executorch/extension/parallel:thread_parallel", "//executorch/kernels/optimized:libutils", "//executorch/runtime/core/exec_aten:lib", ], diff --git a/runtime/kernel/targets.bzl b/runtime/kernel/targets.bzl index d49435f2825..e67f76728b8 100644 --- a/runtime/kernel/targets.bzl +++ b/runtime/kernel/targets.bzl @@ -51,6 +51,19 @@ def define_common_targets(): preprocessor_flags = ["-DMAX_KERNEL_NUM=1"], ) + runtime.cxx_library( + name = "thread_parallel_interface", + exported_headers = ["thread_parallel_interface.h"], + exported_deps = [ + "//executorch/runtime/core:core", + "//executorch/runtime/platform:platform", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + ) + for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" diff --git a/runtime/kernel/thread_parallel_interface.h b/runtime/kernel/thread_parallel_interface.h new file mode 100644 index 00000000000..82e34ecf7c0 --- /dev/null +++ b/runtime/kernel/thread_parallel_interface.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +#include +#include + +namespace executorch { +namespace extension { +namespace internal { +template +inline bool parallel_for_no_threadpool( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const Func& f) { + ET_CHECK_OR_RETURN_FALSE( + begin >= 0 && end >= 0 && end >= begin, + "begin = %" PRId64 ", end = %" PRId64, + begin, + end); + ET_CHECK_OR_RETURN_FALSE(grain_size > 0, "grain_size = %" PRId64, grain_size); + f(begin, end); + return true; +} + +} // namespace internal + +#ifdef ET_USE_THREADPOOL +/** + * A helper to run a function in parallel. + * + * begin, end: describe the extent of the workitems via first and last workitem + * to be processed + * grain_size: number of workitems processed by user callback which is + * described below + * f: user function applied in parallel to the chunks, signature: + * void f(int64_t begin, int64_t end) + * Returns true if all work items are processed successfully, false otherwise + * + * Warning: parallel_for does NOT copy thread local states from the current + * thread to the worker threads. Users need to protect the access to captured + * data if they mutate them in f. 
+ */ +bool parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const std::function& f); + +int64_t get_thread_num(); + +void set_thread_num(int64_t thread_num); +#else // ET_USE_THREADPOOL +template +bool parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const Func& func) { + return internal::parallel_for_no_threadpool(begin, end, grain_size, func); +} + +inline int64_t get_thread_num() { + return 0; +} + +void set_thread_num(int64_t thread_num) { + ET_DCHECK_MSG(false, "cannot set_thread_num without threading support!"); +} +#endif // ET_USE_THREADPOOL +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::get_thread_num; +using ::executorch::extension::parallel_for; +using ::executorch::extension::set_thread_num; +} // namespace executor +} // namespace torch diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json index cc5e625f1e8..be594f9d5f4 100644 --- a/test/utils/OSSTestConfig.json +++ b/test/utils/OSSTestConfig.json @@ -59,6 +59,16 @@ "extension_tensor" ] }, + { + "directory": "extension/threadpool/test", + "sources": [ + "thread_parallel_test.cpp", + "threadpool_test.cpp" + ], + "additional_libs": [ + "extension_threadpool" + ] + }, { "directory": "kernels/portable/cpu/util/test", "sources": [ From 0102e256c1b5dff99bad9ef25e6ab3982d2ab9b3 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 5 Mar 2025 17:36:57 -0800 Subject: [PATCH 11/38] Update [ghstack-poisoned] --- extension/llm/custom_ops/op_sdpa.cpp | 2 +- extension/threadpool/test/thread_parallel_test.cpp | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index db7cb42f6d0..371fcf38a24 100644 --- 
a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -19,8 +19,8 @@ #include #ifdef ET_USE_THREADPOOL -#include #include +#include #endif #include diff --git a/extension/threadpool/test/thread_parallel_test.cpp b/extension/threadpool/test/thread_parallel_test.cpp index 63581be29e8..e31f16eee22 100644 --- a/extension/threadpool/test/thread_parallel_test.cpp +++ b/extension/threadpool/test/thread_parallel_test.cpp @@ -46,11 +46,16 @@ class ParallelTest : public ::testing::TestWithParam { } template - bool parallel_for(const int64_t begin, const int64_t end, const int64_t grain_size, const Func& func) { + bool parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const Func& func) { if (GetParam()) { return executorch::extension::parallel_for(begin, end, grain_size, func); } - return executorch::extension::internal::parallel_for_no_threadpool(begin, end, grain_size, func); + return executorch::extension::internal::parallel_for_no_threadpool( + begin, end, grain_size, func); } std::array data_; @@ -203,4 +208,7 @@ TEST_P(ParallelTest, TestChunkSizeTooLarge) { } } -INSTANTIATE_TEST_SUITE_P(ParallelTestWithOrWithoutThreadpool, ParallelTest, ::testing::Values(true, false)); +INSTANTIATE_TEST_SUITE_P( + ParallelTestWithOrWithoutThreadpool, + ParallelTest, + ::testing::Values(true, false)); From 956f8a5ec412862697753db5c2d8f84decb990bb Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 5 Mar 2025 17:36:57 -0800 Subject: [PATCH 12/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/reduce_util.h | 20 ++++++++++++++------ kernels/portable/cpu/util/targets.bzl | 6 +++++- runtime/kernel/thread_parallel_interface.h | 14 +++++++++++++- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/kernels/portable/cpu/util/reduce_util.h b/kernels/portable/cpu/util/reduce_util.h index 35cfdfbaa72..6d7b17443ee 100644 --- a/kernels/portable/cpu/util/reduce_util.h +++ 
b/kernels/portable/cpu/util/reduce_util.h @@ -8,8 +8,10 @@ #pragma once +#include #include #include +#include #include #include @@ -24,9 +26,12 @@ void apply_on_flat_ix_with_stride_and_base( const size_t base, const size_t start, const size_t end) { - for (size_t i = start; i <= end; i++) { - fn(base + i * stride); - } + executorch::extension::parallel_for( + start, end + 1, [&](auto start_, auto end_) { + for (const auto i : c10::irange(start_, end_)) { + fn(base + i * stride); + } + }); } template @@ -36,9 +41,12 @@ void apply_on_flat_and_dim_ix_with_stride_and_base( const size_t base, const size_t start, const size_t end) { - for (size_t i = start; i <= end; i++) { - fn(base + i * stride, i); - } + executorch::extension::parallel_for( + start, end + 1, [&](auto start_, auto end_) { + for (const auto i : c10::irange(start_, end_)) { + fn(base + i * stride, i); + } + }); } template diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index c42f38fd8b0..3a7e4e1f9bc 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -299,8 +299,12 @@ def define_common_targets(): srcs = ["reduce_util.cpp"], exported_headers = ["reduce_util.h"], deps = [ - "//executorch/runtime/kernel:kernel_includes{}".format(suffix), "//executorch/runtime/core/exec_aten/util:tensor_util{}".format(suffix), + "//executorch/runtime/kernel:kernel_includes{}".format(suffix), + ], + exported_deps = [ + "//executorch/runtime/kernel:thread_parallel_interface", + "//executorch/runtime/core/portable_type/c10/c10:c10", ], exported_preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_mode else [], visibility = [ diff --git a/runtime/kernel/thread_parallel_interface.h b/runtime/kernel/thread_parallel_interface.h index 82e34ecf7c0..ad90218fd22 100644 --- a/runtime/kernel/thread_parallel_interface.h +++ b/runtime/kernel/thread_parallel_interface.h @@ -33,6 +33,10 @@ inline bool parallel_for_no_threadpool( return true; } +// Match 
GRAIN_SIZE from PyTorch core. +// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/TensorIterator.h#L78 +constexpr int64_t GRAIN_SIZE = 32768; + } // namespace internal #ifdef ET_USE_THREADPOOL @@ -74,10 +78,18 @@ inline int64_t get_thread_num() { return 0; } -void set_thread_num(int64_t thread_num) { +inline void set_thread_num(int64_t thread_num) { ET_DCHECK_MSG(false, "cannot set_thread_num without threading support!"); } #endif // ET_USE_THREADPOOL + +/** + * Convenience version of parallel_for that sets the grain size to internal::GRAIN_SIZE. + */ +template +bool parallel_for(const int64_t begin, const int64_t end, const Func& func) { + return parallel_for(begin, end, internal::GRAIN_SIZE, func); +} } // namespace extension } // namespace executorch From 9f7f0c1fb07bb90be79c8aeccc787853c361dad1 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 5 Mar 2025 18:29:03 -0800 Subject: [PATCH 13/38] Update [ghstack-poisoned] --- configurations/CMakeLists.txt | 13 ++++++++++++- extension/android/CMakeLists.txt | 4 ---- kernels/portable/CMakeLists.txt | 17 +++++++++++++++++ 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/configurations/CMakeLists.txt b/configurations/CMakeLists.txt index 462124a6ea6..a63999f8833 100644 --- a/configurations/CMakeLists.txt +++ b/configurations/CMakeLists.txt @@ -47,12 +47,23 @@ if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) message("Generated files ${gen_command_sources}") # optimized_native_cpu_ops_lib: Register optimized op kernels into the runtime + if(NOT DEFINED EXECUTORCH_HAVE_OPTIMIZED_PORTABLE_KERNELS) + message(FATAL_ERROR "EXECUTORCH_HAVE_OPTIMIZED_PORTABLE_KERNELS was not defined!") + endif() + if(${EXECUTORCH_HAVE_OPTIMIZED_PORTABLE_KERNELS}) + if(NOT TARGET optimized_portable_kernels) + message(FATAL_ERROR "optimized_portable_kernels missing") + endif() + set(_optimized_native_cpu_ops_lib_portable_kernels_lib optimized_portable_kernels) + else() + 
set(_optimized_native_cpu_ops_lib_portable_kernels_lib portable_kernels) + endif() gen_operators_lib( LIB_NAME "optimized_native_cpu_ops_lib" KERNEL_LIBS - portable_kernels optimized_kernels + ${_optimized_native_cpu_ops_lib_portable_kernels_lib} DEPS executorch ) diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index ba722d9c791..03595efbfea 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -84,10 +84,6 @@ if(TARGET optimized_native_cpu_ops_lib) APPEND link_libraries optimized_native_cpu_ops_lib - optimized_kernels - portable_kernels - cpublas - eigen_blas ) target_link_options_shared_lib(optimized_native_cpu_ops_lib) else() diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index 885c509246b..2cecefd6d24 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -63,6 +63,23 @@ gen_operators_lib( LIB_NAME "portable_ops_lib" KERNEL_LIBS portable_kernels DEPS executorch ) +# Portable kernels support optional parallelization (and, in the +# future, perhaps other performance features). If support is present, +# produce an optimized version. 
+set(EXECUTORCH_HAVE_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL PARENT_SCOPE) +set(EXECUTORCH_HAVE_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL) + +if(${EXECUTORCH_HAVE_OPTIMIZED_PORTABLE_KERNELS}) + add_library(optimized_portable_kernels ${_portable_kernels__srcs}) + target_link_libraries(optimized_portable_kernels PRIVATE executorch) + target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool) + target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options}) + install( + TARGETS optimized_portable_kernels + DESTINATION lib + ) +endif() + install( TARGETS portable_kernels portable_ops_lib DESTINATION lib From c130224cbfd24d014f1e758264b9c974426dc683 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 5 Mar 2025 20:09:42 -0800 Subject: [PATCH 14/38] Update [ghstack-poisoned] --- extension/parallel/thread_parallel.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/extension/parallel/thread_parallel.h b/extension/parallel/thread_parallel.h index 5f4edeb333c..8bd1a572cd7 100644 --- a/extension/parallel/thread_parallel.h +++ b/extension/parallel/thread_parallel.h @@ -9,8 +9,6 @@ #pragma once // This header is a stub left behind after the move to -// executorch/runtime/kernel. Depend on this target and include this -// header if you have a hard requirement for threading; if you want to -// cleanly use parallelization if available, then depend on and use -// the below header instead. +// executorch/runtime/kernel. As such, it is deprecated; include and +// use the below header directly instead. 
#include From 754a4f6db525a3c33327b8ca9c00e6f22326266d Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 6 Mar 2025 09:56:39 -0800 Subject: [PATCH 15/38] Update [ghstack-poisoned] --- build/executorch-config.cmake | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index e2cff7da6b5..b49f45aa241 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -1,4 +1,3 @@ - # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # @@ -16,20 +15,23 @@ # # This will define the following variables: # -# EXECUTORCH_FOUND -- True if the system has the ExecuTorch library -# EXECUTORCH_INCLUDE_DIRS -- The include directories for ExecuTorch -# EXECUTORCH_LIBRARIES -- Libraries to link against +# EXECUTORCH_FOUND -- True if the system has the ExecuTorch library +# EXECUTORCH_INCLUDE_DIRS -- The include directories for ExecuTorch +# EXECUTORCH_LIBRARIES -- Libraries to link against # -# The actual values for these variables will be different from what executorch-config.cmake -# in executorch pip package gives, but we wanted to keep the contract of exposing these -# CMake variables. +# The actual values for these variables will be different from what +# executorch-config.cmake in executorch pip package gives, but we wanted to keep +# the contract of exposing these CMake variables. 
cmake_minimum_required(VERSION 3.19) set(_root "${CMAKE_CURRENT_LIST_DIR}/../../..") set(required_lib_list executorch executorch_core portable_kernels) set(EXECUTORCH_LIBRARIES) -set(EXECUTORCH_INCLUDE_DIRS ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 ${_root}/lib) +set(EXECUTORCH_INCLUDE_DIRS + ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 + ${_root}/lib +) foreach(lib ${required_lib_list}) set(lib_var "LIB_${lib}") add_library(${lib} STATIC IMPORTED) @@ -40,7 +42,12 @@ foreach(lib ${required_lib_list}) ) set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}") target_compile_definitions(${lib} INTERFACE C10_USING_CUSTOM_GENERATED_MACROS) - target_include_directories(${lib} INTERFACE ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 ${_root}/lib) + target_include_directories( + ${lib} + INTERFACE ${_root}/include + ${_root}/include/executorch/runtime/core/portable_type/c10 + ${_root}/lib + ) list(APPEND EXECUTORCH_LIBRARIES ${lib}) endforeach() @@ -112,7 +119,12 @@ foreach(lib ${lib_list}) add_library(${lib} STATIC IMPORTED) endif() set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}") - target_include_directories(${lib} INTERFACE ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 ${_root}/lib) + target_include_directories( + ${lib} + INTERFACE ${_root}/include + ${_root}/include/executorch/runtime/core/portable_type/c10 + ${_root}/lib + ) list(APPEND EXECUTORCH_LIBRARIES ${lib}) endif() endforeach() From 6350e07e6ee0fe130ef804d223b91957a1a0d1c5 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 7 Mar 2025 09:48:06 -0800 Subject: [PATCH 16/38] Update [ghstack-poisoned] --- runtime/kernel/targets.bzl | 1 + runtime/kernel/thread_parallel_interface.h | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/runtime/kernel/targets.bzl b/runtime/kernel/targets.bzl index e67f76728b8..5c95f10276d 100644 --- 
a/runtime/kernel/targets.bzl +++ b/runtime/kernel/targets.bzl @@ -56,6 +56,7 @@ def define_common_targets(): exported_headers = ["thread_parallel_interface.h"], exported_deps = [ "//executorch/runtime/core:core", + "//executorch/runtime/core/portable_type/c10/c10:c10", "//executorch/runtime/platform:platform", ], visibility = [ diff --git a/runtime/kernel/thread_parallel_interface.h b/runtime/kernel/thread_parallel_interface.h index 82e34ecf7c0..bd1d48f52bc 100644 --- a/runtime/kernel/thread_parallel_interface.h +++ b/runtime/kernel/thread_parallel_interface.h @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -29,7 +30,17 @@ inline bool parallel_for_no_threadpool( begin, end); ET_CHECK_OR_RETURN_FALSE(grain_size > 0, "grain_size = %" PRId64, grain_size); +#ifndef NDEBUG + // Go backwards through the range elementwise to catch code that + // assumes parallel_for is in order like a regular for loop. + for (const auto i : c10::irange(begin, end)) { + const auto offset = i - begin; + const auto idx = end - offset - 1; + f(idx, idx + 1); + } +#else // NDEBUG f(begin, end); +#endif return true; } From 4dd58a06352c4e895178f9b120816cfe13e67e80 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 7 Mar 2025 12:40:48 -0800 Subject: [PATCH 17/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/broadcast_util.cpp | 22 ---------- kernels/portable/cpu/util/broadcast_util.h | 31 +------------ .../portable/cpu/util/delinearize_index.cpp | 33 ++++++++++++++ kernels/portable/cpu/util/delinearize_index.h | 43 +++++++++++++++++++ kernels/portable/cpu/util/targets.bzl | 6 ++- 5 files changed, 82 insertions(+), 53 deletions(-) create mode 100644 kernels/portable/cpu/util/delinearize_index.cpp create mode 100644 kernels/portable/cpu/util/delinearize_index.h diff --git a/kernels/portable/cpu/util/broadcast_util.cpp b/kernels/portable/cpu/util/broadcast_util.cpp index 381e07cbe30..28a34426b23 100644 --- a/kernels/portable/cpu/util/broadcast_util.cpp +++ 
b/kernels/portable/cpu/util/broadcast_util.cpp @@ -269,28 +269,6 @@ ET_NODISCARD Error get_broadcast_target_size( a.sizes(), b.sizes(), out_sizes, out_sizes_len, out_dim); } -void delinearize_index( - size_t linear_index, - executorch::aten::ArrayRef shape, - size_t* out_indexes, - const size_t out_indexes_len) { - ET_CHECK(shape.size() <= out_indexes_len); - for (size_t i = 0; i < shape.size(); ++i) { - auto dim = shape.size() - 1 - i; - auto dim_size = shape[dim]; - out_indexes[dim] = linear_index % dim_size; - linear_index /= dim_size; - } -} - -void delinearize_index( - size_t linear_index, - const Tensor& t, - size_t* out_indexes, - const size_t out_indexes_len) { - delinearize_index(linear_index, t.sizes(), out_indexes, out_indexes_len); -} - size_t linearize_access_indexes( ArrayRef indexes_broadcast_to, ssize_t broadcast_to_ndim, diff --git a/kernels/portable/cpu/util/broadcast_util.h b/kernels/portable/cpu/util/broadcast_util.h index f6bfae9bdaa..ed536f86c2d 100644 --- a/kernels/portable/cpu/util/broadcast_util.h +++ b/kernels/portable/cpu/util/broadcast_util.h @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -207,36 +208,6 @@ ET_NODISCARD inline Error resize_to_broadcast_target_size( ET_DEPRECATED void free_broadcast_tensor( const executorch::aten::Tensor& broadcast_tensor); -/** - * Delinearize a flattened index to per-dimension indexes. - * - * @param[in] linear_index The flattened index - * @param[in] shape The tensor shape - * @param[out] out_indexes The per-dimension indexes - * @param[in] out_indexes_len The maximum size of the out_indexes array - * @returns void - */ -void delinearize_index( - size_t linear_index, - executorch::aten::ArrayRef shape, - size_t* out_indexes, - const size_t out_indexes_len); - -/** - * Delinearize a flattened index to per-dimension indexes. 
- * - * @param[in] linear_index The flattened index - * @param[in] t The tensor object - * @param[out] out_indexes The per-dimension indexes - * @param[in] out_indexes_len The maximum size of the out_indexes array - * @returns void - */ -void delinearize_index( - size_t linear_index, - const Tensor& t, - size_t* out_indexes, - const size_t out_indexes_len); - /** * Return the linear index for broatcast_from tensor, given the indexes and * number of dimensions of broadcast_to tensor, and the shape and strides diff --git a/kernels/portable/cpu/util/delinearize_index.cpp b/kernels/portable/cpu/util/delinearize_index.cpp new file mode 100644 index 00000000000..45378e6b05d --- /dev/null +++ b/kernels/portable/cpu/util/delinearize_index.cpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include + +namespace torch::executor { +void delinearize_index( + size_t linear_index, + executorch::aten::ArrayRef shape, + size_t* out_indexes, + const size_t out_indexes_len) { + ET_CHECK(shape.size() <= out_indexes_len); + for (size_t i = 0; i < shape.size(); ++i) { + auto dim = shape.size() - 1 - i; + auto dim_size = shape[dim]; + out_indexes[dim] = linear_index % dim_size; + linear_index /= dim_size; + } +} + +void delinearize_index( + size_t linear_index, + const Tensor& t, + size_t* out_indexes, + const size_t out_indexes_len) { + delinearize_index(linear_index, t.sizes(), out_indexes, out_indexes_len); +} +} // namespace torch::executor diff --git a/kernels/portable/cpu/util/delinearize_index.h b/kernels/portable/cpu/util/delinearize_index.h new file mode 100644 index 00000000000..3441aa6083f --- /dev/null +++ b/kernels/portable/cpu/util/delinearize_index.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace torch::executor { +/** + * Delinearize a flattened index to per-dimension indexes. + * + * @param[in] linear_index The flattened index + * @param[in] shape The tensor shape + * @param[out] out_indexes The per-dimension indexes + * @param[in] out_indexes_len The maximum size of the out_indexes array + * @returns void + */ +void delinearize_index( + size_t linear_index, + executorch::aten::ArrayRef shape, + size_t* out_indexes, + const size_t out_indexes_len); + +/** + * Delinearize a flattened index to per-dimension indexes. + * + * @param[in] linear_index The flattened index + * @param[in] t The tensor object + * @param[out] out_indexes The per-dimension indexes + * @param[in] out_indexes_len The maximum size of the out_indexes array + * @returns void + */ +void delinearize_index( + size_t linear_index, + const Tensor& t, + size_t* out_indexes, + const size_t out_indexes_len); +} // namespace torch::executor diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index 1c8edb9d3c7..e1c5cadfe84 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -66,9 +66,13 @@ def define_common_targets(): runtime.cxx_library( name = "broadcast_util", - srcs = ["broadcast_util.cpp"], + srcs = [ + "broadcast_util.cpp", + "delinearize_index.cpp", + ], exported_headers = [ "broadcast_util.h", + "delinearize_index.h", ], exported_deps = [ ":broadcast_indexes_range", From 1b6eb9f3309216b60ad32a09348de516c94bf6c8 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 7 Mar 2025 12:40:53 -0800 Subject: [PATCH 18/38] Update [ghstack-poisoned] --- .../cpu/util/broadcast_indexes_range.h | 43 ++++++++++-- .../test/broadcast_indexes_range_test.cpp | 70 ++++++++++++------- 2 files changed, 82 insertions(+), 31 deletions(-) diff --git 
a/kernels/portable/cpu/util/broadcast_indexes_range.h b/kernels/portable/cpu/util/broadcast_indexes_range.h index c623fdb4c31..c749541058a 100644 --- a/kernels/portable/cpu/util/broadcast_indexes_range.h +++ b/kernels/portable/cpu/util/broadcast_indexes_range.h @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -104,11 +105,42 @@ class BroadcastIndexesIterator { return it; } + BroadcastIndexesIterator& operator+=(difference_type n) { + if (n <= 3) { + std::advance(*this, n); + return *this; + } + + output_index() += n; + delinearize_index( + output_index(), + output_shape_, + delinearized_output_index_.data(), + delinearized_output_index_.size()); + for (const auto ii : c10::irange(1, kNumInputs + 1)) { + current_indexes_[ii] = 0; + for (const auto jj : c10::irange(output_dim_)) { + current_indexes_[ii] += delinearized_output_index_[jj] * + effective_input_broadcast_strides_[ii - 1][jj]; + } + } + return *this; + } + + BroadcastIndexesIterator operator+(difference_type n) { + auto it = *this; + it += n; + return it; + } + difference_type operator-(const BroadcastIndexesIterator& rhs) const { return difference_type(output_index() - rhs.output_index()); } private: + using ShapeType = + std::array; + ssize_t output_index() const { return current_indexes_[0]; } @@ -117,11 +149,10 @@ class BroadcastIndexesIterator { return current_indexes_[0]; } - std::array - effective_input_broadcast_stride(const Tensor& output, const Tensor& t) - const { - std::array - result = {0}; + ShapeType effective_input_broadcast_stride( + const Tensor& output, + const Tensor& t) const { + ShapeType result = {0}; ET_CHECK_MSG( t.dim() <= output.dim(), "input to broadcasting op should have dim at most output dim, but %d > %d!", @@ -146,8 +177,6 @@ class BroadcastIndexesIterator { // The 0th entry is the current linear index into the output, // followed by kNumInputs input indexes. 
std::array current_indexes_ = {0}; - using ShapeType = std:: - array; ShapeType delinearized_output_index_ = {0}; ssize_t output_dim_; ArrayRef output_shape_; diff --git a/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp b/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp index f147958558d..519cd9fe9f9 100644 --- a/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp +++ b/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp @@ -68,6 +68,15 @@ TEST(BroadcastIndexesRangeTest, ScalarBroadcastToOneD) { EXPECT_EQ(expected, actual); } +template +void test_operator_plus(const Range& range) { + size_t idx = 0; + for (const auto indexes : range) { + EXPECT_EQ(*(range.begin() + idx), indexes); + idx++; + } +} + // [1] -> [H, W] // [W] -> [H, W] // [1, 1] -> [H, W] @@ -87,14 +96,15 @@ TEST(BroadcastIndexesRangeTest, OneAndTwoDExhaustive) { Tensor in_not_broadcast = tf.zeros({3, 4}); - auto actual = range_to_vec(BroadcastIndexesRange<6>( + const auto range = BroadcastIndexesRange<6>( out, in_0d_scalar, in_1d_scalar, in_2d_scalar, in_row, in_col, - in_not_broadcast)); + in_not_broadcast); + auto actual = range_to_vec(range); decltype(actual) expected = { {0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 1, 0, 1}, @@ -110,6 +120,8 @@ TEST(BroadcastIndexesRangeTest, OneAndTwoDExhaustive) { {11, 0, 0, 0, 3, 2, 11}, }; EXPECT_EQ(expected, actual); + + test_operator_plus(range); } // Make sure nothing is thrown off by a size-1 dim in the output: @@ -138,20 +150,20 @@ TEST(BroadcastIndexesRangeTest, OneAndTwoDWith1InOutputShapeExhaustive) { Tensor in_col = tf.zeros({H, 1}); size_t idx = 0; + const auto range_row = BroadcastIndexesRange<5>( + out_row, + in_0d_scalar, + in_1d_scalar, + in_2d_scalar, + in_row, + in_leading_one_row); for (const auto [out_idx, in_0d_idx, in_1d_idx, in_2d_idx, in_row_idx, - in_leading_one_row_idx] : - BroadcastIndexesRange<5>( - out_row, - in_0d_scalar, - in_1d_scalar, - in_2d_scalar, - in_row, - in_leading_one_row)) { 
+ in_leading_one_row_idx] : range_row) { EXPECT_EQ(out_idx, idx++); EXPECT_EQ(in_0d_idx, 0); EXPECT_EQ(in_1d_idx, 0); @@ -160,16 +172,21 @@ TEST(BroadcastIndexesRangeTest, OneAndTwoDWith1InOutputShapeExhaustive) { EXPECT_EQ(in_leading_one_row_idx, out_idx); } + test_operator_plus(range_row); + idx = 0; + const auto range_col = BroadcastIndexesRange<4>( + out_col, in_0d_scalar, in_1d_scalar, in_2d_scalar, in_col); for (const auto [out_idx, in_0d_idx, in_1d_idx, in_2d_idx, in_col_idx] : - BroadcastIndexesRange<4>( - out_col, in_0d_scalar, in_1d_scalar, in_2d_scalar, in_col)) { + range_col) { EXPECT_EQ(out_idx, idx++); EXPECT_EQ(in_0d_idx, 0); EXPECT_EQ(in_1d_idx, 0); EXPECT_EQ(in_2d_idx, 0); EXPECT_EQ(in_col_idx, out_idx); } + + test_operator_plus(range_col); } // [1, 1, 1] -> [C, H, W] @@ -197,16 +214,17 @@ TEST(BroadcastIndexesRangeTest, ThreeDBroadcasting) { // take the opportunity to mutation test against delinearize_index // and linearize_access_indexes. int idx = 0; - for (const auto indexes : BroadcastIndexesRange<8>( - out, - input_tensors[0], - input_tensors[1], - input_tensors[2], - input_tensors[3], - input_tensors[4], - input_tensors[5], - input_tensors[6], - input_tensors[7])) { + const auto range = BroadcastIndexesRange<8>( + out, + input_tensors[0], + input_tensors[1], + input_tensors[2], + input_tensors[3], + input_tensors[4], + input_tensors[5], + input_tensors[6], + input_tensors[7]); + for (const auto indexes : range) { const auto out_idx = indexes[0]; EXPECT_EQ(out_idx, idx++); size_t out_indexes[executorch::runtime::kTensorDimensionLimit]; @@ -219,6 +237,7 @@ TEST(BroadcastIndexesRangeTest, ThreeDBroadcasting) { out_indexes, out.dim(), input_tensors[tensor_idx])); } } + test_operator_plus(range); } // 4-D should generalize, but we will go ahead and test: @@ -235,8 +254,9 @@ void four_d_broadcasting_test() { // take the opportunity to mutation test against delinearize_index // and linearize_access_indexes. 
int idx = 0; - for (const auto [out_idx, in_cw_idx, in_nh_idx] : - BroadcastIndexesRange<2>(out, in_broadcast_cw, in_broadcast_nh)) { + const auto range = + BroadcastIndexesRange<2>(out, in_broadcast_cw, in_broadcast_nh); + for (const auto [out_idx, in_cw_idx, in_nh_idx] : range) { EXPECT_EQ(out_idx, idx++); size_t out_indexes[executorch::runtime::kTensorDimensionLimit]; delinearize_index( @@ -248,6 +268,8 @@ void four_d_broadcasting_test() { in_nh_idx, linearize_access_indexes(out_indexes, out.dim(), in_broadcast_nh)); } + + test_operator_plus(range); } TEST(BroadcastIndexesRangeTest, FourDBroadcasting) { From 450e50b0cb810bf52ead3e5a8aed705916919157 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 7 Mar 2025 12:40:57 -0800 Subject: [PATCH 19/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/elementwise_util.h | 208 ++++++++----------- 1 file changed, 89 insertions(+), 119 deletions(-) diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index 09db5f7180d..a5bcd6ff98b 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -14,6 +14,9 @@ #include #include +#include +#include + namespace torch { namespace executor { namespace native { @@ -46,38 +49,94 @@ inline int64_t scalar_to(const Scalar& s) { : s.to(); } -template -inline void apply_unitensor_elementwise_fn( +namespace internal { +template < + typename CTYPE_COMMON, + const char* op_name, + typename Op, + typename... Args> +inline void apply_elementwise_fn( const Op& compute_fun, KernelRuntimeContext& ctx, - const Tensor& a, - SupportedTensorDtypes a_dtypes, const Tensor& out, - SupportedTensorDtypes out_dtypes) { + SupportedTensorDtypes out_dtypes, + Args... 
inputs) { + static_assert( + (std::is_same_v> && + ...)); + constexpr auto kNumInputs = sizeof...(inputs); constexpr auto compute_type = CppTypeToScalarType::value; - + const auto check_input_dtype = [](auto input, auto compute_type) { + return internal::check_tensor_dtype( + *input.first, input.second, compute_type); + }; ET_KERNEL_CHECK( ctx, - (internal::check_tensor_dtype(a, a_dtypes, compute_type) && - internal::check_tensor_dtype(out, out_dtypes, compute_type)), + (check_input_dtype(inputs, compute_type) && ...) && + internal::check_tensor_dtype(out, out_dtypes, compute_type), InvalidArgument, ); - const auto load_a_to_common = - internal::get_load_to_common_fn(a, a_dtypes); + bool any_is_broadcasted = false; + if constexpr (kNumInputs > 1) { + any_is_broadcasted = (!out.sizes().equals(inputs.first->sizes()) || ...); + } + + struct InputInfo { + load_to_common_fn load_to_common; + const char* data_ptr; + ssize_t element_size; + }; + std::array inputs_info = {(InputInfo{ + internal::get_load_to_common_fn( + *inputs.first, inputs.second), + reinterpret_cast(inputs.first->const_data_ptr()), + inputs.first->element_size(), + })...}; + const auto store_common_to_out = internal::get_store_common_to_tensor_fn( out, out_dtypes); - const char* const data_a = reinterpret_cast(a.const_data_ptr()); - const auto a_element_size = a.element_size(); - const auto out_element_size = out.element_size(); char* const data_out = reinterpret_cast(out.mutable_data_ptr()); + const auto out_element_size = out.element_size(); - auto out_numel = out.numel(); - for (const auto i : c10::irange(out_numel)) { - auto result = compute_fun(load_a_to_common(&data_a[i * a_element_size])); - store_common_to_out(result, &data_out[i * out_element_size]); + if (any_is_broadcasted) { + for (const auto& indexes : + BroadcastIndexesRange(out, (*inputs.first)...)) { + std::array loaded_inputs; + for (const auto idx : c10::irange(kNumInputs)) { + const auto& input_info = inputs_info[idx]; + 
loaded_inputs[idx] = input_info.load_to_common( + &input_info.data_ptr[indexes[idx + 1] * input_info.element_size]); + } + auto result = std::apply(compute_fun, loaded_inputs); + store_common_to_out(result, &data_out[indexes[0] * out_element_size]); + } + } else { + for (const auto i : c10::irange(out.numel())) { + std::array loaded_inputs; + for (const auto idx : c10::irange(kNumInputs)) { + const auto& input_info = inputs_info[idx]; + loaded_inputs[idx] = input_info.load_to_common( + &input_info.data_ptr[i * input_info.element_size]); + } + auto result = std::apply(compute_fun, loaded_inputs); + store_common_to_out(result, &data_out[i * out_element_size]); + } } } +} // namespace internal + +template +inline void apply_unitensor_elementwise_fn( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& a, + SupportedTensorDtypes a_dtypes, + const Tensor& out, + SupportedTensorDtypes out_dtypes) { + internal::apply_elementwise_fn( + compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes)); +} /** * Useful for bi-tensor elementwise operators. 
For each element of the inputs, @@ -94,53 +153,13 @@ inline void apply_bitensor_elementwise_fn( SupportedTensorDtypes b_dtypes, const Tensor& out, SupportedTensorDtypes out_dtypes) { - constexpr auto compute_type = CppTypeToScalarType::value; - - ET_KERNEL_CHECK( + internal::apply_elementwise_fn( + compute_fun, ctx, - (internal::check_tensor_dtype(a, a_dtypes, compute_type) && - internal::check_tensor_dtype(b, b_dtypes, compute_type) && - internal::check_tensor_dtype(out, out_dtypes, compute_type)), - InvalidArgument, ); - - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); - - const auto load_a_to_common = - internal::get_load_to_common_fn(a, a_dtypes); - const auto load_b_to_common = - internal::get_load_to_common_fn(b, b_dtypes); - const auto store_common_to_out = - internal::get_store_common_to_tensor_fn( - out, out_dtypes); - const char* const data_a = reinterpret_cast(a.const_data_ptr()); - const char* const data_b = reinterpret_cast(b.const_data_ptr()); - const auto a_element_size = a.element_size(); - const auto b_element_size = b.element_size(); - const auto out_element_size = out.element_size(); - char* const data_out = reinterpret_cast(out.mutable_data_ptr()); - - auto out_numel = out.numel(); - if (any_is_broadcasted) { - for (const auto [out_index, a_index, b_index] : - BroadcastIndexesRange<2>(out, a, b)) { - auto result = compute_fun( - load_a_to_common(&data_a[a_index * a_element_size]), - load_b_to_common(&data_b[b_index * b_element_size])); - store_common_to_out(result, &data_out[out_index * out_element_size]); - } - } else { - for (const auto i : c10::irange(out_numel)) { - size_t a_linear_index = i; - size_t b_linear_index = i; - - auto result = compute_fun( - load_a_to_common(&data_a[a_linear_index * a_element_size]), - load_b_to_common(&data_b[b_linear_index * b_element_size])); - 
store_common_to_out(result, &data_out[i * out_element_size]); - } - } + out, + out_dtypes, + std::make_pair(&a, a_dtypes), + std::make_pair(&b, b_dtypes)); } /** @@ -175,63 +194,14 @@ inline void apply_tritensor_elementwise_fn( SupportedTensorDtypes c_dtypes, const Tensor& out, SupportedTensorDtypes out_dtypes) { - constexpr auto compute_type = CppTypeToScalarType::value; - - ET_KERNEL_CHECK( + internal::apply_elementwise_fn( + compute_fun, ctx, - (internal::check_tensor_dtype(a, a_dtypes, compute_type) && - internal::check_tensor_dtype(b, b_dtypes, compute_type) && - internal::check_tensor_dtype(c, c_dtypes, compute_type) && - internal::check_tensor_dtype(out, out_dtypes, compute_type)), - InvalidArgument, ); - - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool c_is_broadcasted = !out.sizes().equals(c.sizes()); - const bool any_is_broadcasted = - (a_is_broadcasted || b_is_broadcasted || c_is_broadcasted); - - const auto load_a_to_common = - internal::get_load_to_common_fn(a, a_dtypes); - const auto load_b_to_common = - internal::get_load_to_common_fn(b, b_dtypes); - const auto load_c_to_common = - internal::get_load_to_common_fn(c, c_dtypes); - const auto store_common_to_out = - internal::get_store_common_to_tensor_fn( - out, out_dtypes); - const char* const data_a = reinterpret_cast(a.const_data_ptr()); - const char* const data_b = reinterpret_cast(b.const_data_ptr()); - const char* const data_c = reinterpret_cast(c.const_data_ptr()); - const auto a_element_size = a.element_size(); - const auto b_element_size = b.element_size(); - const auto c_element_size = c.element_size(); - const auto out_element_size = out.element_size(); - char* const data_out = reinterpret_cast(out.mutable_data_ptr()); - - auto out_numel = out.numel(); - if (any_is_broadcasted) { - for (const auto [out_index, a_index, b_index, c_index] : - BroadcastIndexesRange<3>(out, a, b, c)) { - auto result = 
compute_fun( - load_a_to_common(&data_a[a_index * a_element_size]), - load_b_to_common(&data_b[b_index * b_element_size]), - load_c_to_common(&data_c[c_index * c_element_size])); - store_common_to_out(result, &data_out[out_index * out_element_size]); - } - } else { - for (const auto i : c10::irange(out_numel)) { - size_t a_linear_index = i; - size_t b_linear_index = i; - size_t c_linear_index = i; - - auto result = compute_fun( - load_a_to_common(&data_a[a_linear_index * a_element_size]), - load_b_to_common(&data_b[b_linear_index * b_element_size]), - load_c_to_common(&data_c[c_linear_index * c_element_size])); - store_common_to_out(result, &data_out[i * out_element_size]); - } - } + out, + out_dtypes, + std::make_pair(&a, a_dtypes), + std::make_pair(&b, b_dtypes), + std::make_pair(&c, c_dtypes)); } inline ScalarType get_compute_type(ScalarType& common_type) { From 4459a7e7b069cda3ddd071fde8a7d9232645ceb8 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 7 Mar 2025 12:41:02 -0800 Subject: [PATCH 20/38] Update [ghstack-poisoned] --- kernels/optimized/cpu/op_where.cpp | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/kernels/optimized/cpu/op_where.cpp b/kernels/optimized/cpu/op_where.cpp index 4d897ea6281..0c1958282a5 100644 --- a/kernels/optimized/cpu/op_where.cpp +++ b/kernels/optimized/cpu/op_where.cpp @@ -7,7 +7,7 @@ */ #include #include -#include +#include namespace torch { namespace executor { @@ -58,15 +58,25 @@ Tensor& opt_where_out( const bool* const data_cond = cond.const_data_ptr(); CTYPE_COMPUTE* const data_out = out.data_ptr(); if (any_is_broadcasted) { - for (const auto [out_index, a_index, b_index, cond_index] : - BroadcastIndexesRange<3>(out, a, b, cond)) { - data_out[out_index] = - data_cond[cond_index] ? 
data_a[a_index] : data_b[b_index]; - } + executorch::extension::parallel_for( + 0, out_numel, [&](const auto begin, const auto end) { + auto range = BroadcastIndexesRange<3>(out, a, b, cond); + auto begin_it = range.begin(); + begin_it += begin; + for (; (*begin_it)[0] < end; ++begin_it) { + const auto [out_index, a_index, b_index, cond_index] = + *begin_it; + data_out[out_index] = + data_cond[cond_index] ? data_a[a_index] : data_b[b_index]; + } + }); } else { - for (const auto i : c10::irange(out_numel)) { - data_out[i] = data_cond[i] ? data_a[i] : data_b[i]; - } + executorch::extension::parallel_for( + 0, out_numel, [&](const auto begin, const auto end) { + for (const auto i : c10::irange(begin, end)) { + data_out[i] = data_cond[i] ? data_a[i] : data_b[i]; + } + }); } }); } else { From fad4ed8a112b8a11be2af2dbbbf40ba98e486150 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 7 Mar 2025 13:39:07 -0800 Subject: [PATCH 21/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/broadcast_indexes_range.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernels/portable/cpu/util/broadcast_indexes_range.h b/kernels/portable/cpu/util/broadcast_indexes_range.h index c749541058a..5fa50d8d212 100644 --- a/kernels/portable/cpu/util/broadcast_indexes_range.h +++ b/kernels/portable/cpu/util/broadcast_indexes_range.h @@ -79,7 +79,9 @@ class BroadcastIndexesIterator { // You might wonder what happens if output_shape_[ii] == 0. In // that case, output.numel() would be 0, and thus we would have // begin() == end() and no iteration. 
- if ET_UNLIKELY (delinearized_output_index_[ii] == output_shape_[ii] - 1) { + if ET_UNLIKELY ( + static_cast(delinearized_output_index_[ii]) == + output_shape_[ii] - 1) { const auto old_delinearized_output_index_item = delinearized_output_index_[ii]; delinearized_output_index_[ii] = 0; From 37e42135a2577269da151b1060cc2c30f2f89d43 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 7 Mar 2025 16:10:47 -0800 Subject: [PATCH 22/38] Update [ghstack-poisoned] --- kernels/portable/cpu/op_argmin.cpp | 62 ++++++++++++++++++------------ 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/kernels/portable/cpu/op_argmin.cpp b/kernels/portable/cpu/op_argmin.cpp index a0ee82d2612..67f013637c3 100644 --- a/kernels/portable/cpu/op_argmin.cpp +++ b/kernels/portable/cpu/op_argmin.cpp @@ -12,6 +12,7 @@ #include #include +#include #include namespace torch { @@ -47,30 +48,43 @@ Tensor& argmin_out( ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "argmin.out", CTYPE, [&] { long* out_data = out.mutable_data_ptr(); - for (const auto out_ix : c10::irange(out.numel())) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { - // the below condition as written is equivalent to !isnan(accval) && - // (isnan(v) || v < acc_val). cases: - // - if neither acc_val nor v is NaN, !(v >= acc_val) is - // trivially equivalent to v < acc_val. - // - if acc_val is NaN, the whole thing is trivially false. - // - if acc_val is not NaN and v is NaN, then v >= acc_val - // - is false because all comparisons involving NaN are - // - false, so the result is true. The result is trivially - // - true for the above condition that uses isnan(v) as - // - well. 
- if (!std::isnan(acc_val) && !(v >= acc_val)) { - acc_val = v; - acc_ix = ix; - } - return std::tuple{acc_val, acc_ix}; - }, - in, - dim, - out_ix); - out_data[out_ix] = std::get<1>(acc); - } + // REVIEW: this is the parallelization strategy ATen uses + // specifically when the reduction is along the last dimension and + // that dimension is contiguous. Is there any particular reason we + // shouldn't just always use this strategy since we aren't + // otherwise capable of parallelizing reductions? + const auto reduction_size = + dim.has_value() ? in.sizes().at(dim.value()) : in.numel(); + const auto grain_size = std::max( + static_cast(1), + executorch::extension::internal::GRAIN_SIZE / reduction_size); + executorch::extension::parallel_for( + 0, out.numel(), grain_size, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + std::tuple acc = reduce_over_dim( + [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + // the below condition as written is equivalent to + // !isnan(accval) && (isnan(v) || v < acc_val). cases: + // - if neither acc_val nor v is NaN, !(v >= acc_val) is + // trivially equivalent to v < acc_val. + // - if acc_val is NaN, the whole thing is trivially false. + // - if acc_val is not NaN and v is NaN, then v >= acc_val + // - is false because all comparisons involving NaN are + // - false, so the result is true. The result is trivially + // - true for the above condition that uses isnan(v) as + // - well. 
+ if (!std::isnan(acc_val) && !(v >= acc_val)) { + acc_val = v; + acc_ix = ix; + } + return std::tuple{acc_val, acc_ix}; + }, + in, + dim, + out_ix); + out_data[out_ix] = std::get<1>(acc); + } + }); }); return out; From 4917358e55334df6237f2c70f747c5d4141d2702 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 10 Mar 2025 18:48:48 -0700 Subject: [PATCH 23/38] Update [ghstack-poisoned] --- runtime/kernel/thread_parallel_interface.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/kernel/thread_parallel_interface.h b/runtime/kernel/thread_parallel_interface.h index 82e34ecf7c0..1e79acc75a4 100644 --- a/runtime/kernel/thread_parallel_interface.h +++ b/runtime/kernel/thread_parallel_interface.h @@ -74,7 +74,7 @@ inline int64_t get_thread_num() { return 0; } -void set_thread_num(int64_t thread_num) { +inline void set_thread_num(int64_t thread_num) { ET_DCHECK_MSG(false, "cannot set_thread_num without threading support!"); } #endif // ET_USE_THREADPOOL From 477996063f3f954d8273cd17849006a7bae62fec Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 11 Mar 2025 08:40:21 -0700 Subject: [PATCH 24/38] Update [ghstack-poisoned] --- kernels/portable/cpu/op_argmin.cpp | 14 ++------------ kernels/portable/cpu/util/reduce_util.h | 19 +++++++++++++++++++ kernels/portable/cpu/util/targets.bzl | 3 +++ 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/kernels/portable/cpu/op_argmin.cpp b/kernels/portable/cpu/op_argmin.cpp index 699aee7034f..8b037e37544 100644 --- a/kernels/portable/cpu/op_argmin.cpp +++ b/kernels/portable/cpu/op_argmin.cpp @@ -48,18 +48,8 @@ Tensor& argmin_out( ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "argmin.out", CTYPE, [&] { long* out_data = out.mutable_data_ptr(); - // REVIEW: this is the parallelization strategy ATen uses - // specifically when the reduction is along the last dimension and - // that dimension is contiguous. 
Is there any particular reason we - // shouldn't just always use this strategy since we aren't - // otherwise capable of parallelizing reductions? - const auto reduction_size = - dim.has_value() ? in.sizes().at(dim.value()) : in.numel(); - const auto grain_size = std::max( - static_cast(1), - executorch::extension::internal::GRAIN_SIZE / reduction_size); - const bool success = executorch::extension::parallel_for( - 0, out.numel(), grain_size, [&](const auto begin, const auto end) { + const bool success = parallel_for_each_reduce_over_dim_output_index( + in, dim, out, [&](const auto begin, const auto end) { for (const auto out_ix : c10::irange(begin, end)) { std::tuple acc = reduce_over_dim( [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { diff --git a/kernels/portable/cpu/util/reduce_util.h b/kernels/portable/cpu/util/reduce_util.h index 35cfdfbaa72..45db6cb92d3 100644 --- a/kernels/portable/cpu/util/reduce_util.h +++ b/kernels/portable/cpu/util/reduce_util.h @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -661,6 +662,24 @@ bool check_prod_out_args( optional dtype, Tensor& out); +/** + * parallel_for wrapper for reductions that call reduce_over_dim or + * map_reduce_over_dim for each output element. Automatically + * calculates appropriate grain size. + */ +template +[[nodiscard]] bool parallel_for_each_reduce_over_dim_output_index( + const Tensor& in, + optional dim, + const Tensor& out, + const Func& func) { + const auto reduction_size = + dim.has_value() ? 
in.sizes().at(dim.value()) : in.numel(); + const auto grain_size = std::max( + static_cast(1), + executorch::extension::internal::GRAIN_SIZE / reduction_size); + return executorch::extension::parallel_for(0, out.numel(), grain_size, func); +} #endif } // namespace executor diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index e1c5cadfe84..bf2fe042a93 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -313,6 +313,9 @@ def define_common_targets(): "//executorch/runtime/kernel:kernel_includes{}".format(suffix), "//executorch/runtime/core/exec_aten/util:tensor_util{}".format(suffix), ], + exported_deps = [ + "//executorch/runtime/kernel:thread_parallel_interface", + ], exported_preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_mode else [], visibility = [ "//executorch/extension/llm/custom_ops/...", From e6be3fe5d569cbc14d4ab01631e4b1c181c3b4fb Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 11 Mar 2025 08:57:24 -0700 Subject: [PATCH 25/38] Update [ghstack-poisoned] --- kernels/portable/cpu/op_argmin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernels/portable/cpu/op_argmin.cpp b/kernels/portable/cpu/op_argmin.cpp index 699aee7034f..e2f5cdab163 100644 --- a/kernels/portable/cpu/op_argmin.cpp +++ b/kernels/portable/cpu/op_argmin.cpp @@ -85,7 +85,7 @@ Tensor& argmin_out( out_data[out_ix] = std::get<1>(acc); } }); - ET_KERNEL_CHECK_MSG(ctx, success, Internal, out, "parallel_for failed"); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return out; From 1ef9dd846df35fadc2fff3c1fa282c1d663f39f2 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 11 Mar 2025 08:57:26 -0700 Subject: [PATCH 26/38] Update [ghstack-poisoned] --- kernels/portable/cpu/op_any.cpp | 36 +++++++++++++++------------- kernels/portable/cpu/op_argmax.cpp | 38 +++++++++++++++++------------- kernels/portable/cpu/op_max.cpp | 35 +++++++++++++++------------ 
kernels/portable/cpu/op_min.cpp | 35 +++++++++++++++------------ kernels/portable/cpu/op_prod.cpp | 36 +++++++++++++++------------- 5 files changed, 101 insertions(+), 79 deletions(-) diff --git a/kernels/portable/cpu/op_any.cpp b/kernels/portable/cpu/op_any.cpp index 2cfdf36740b..ea5eafb9ba8 100644 --- a/kernels/portable/cpu/op_any.cpp +++ b/kernels/portable/cpu/op_any.cpp @@ -139,22 +139,26 @@ Tensor& any_out( ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, name, CTYPE_IN, [&] { ET_SWITCH_TWO_TYPES(Bool, Byte, out_type, ctx, name, CTYPE_OUT, [&] { CTYPE_OUT* out_data = out.mutable_data_ptr(); - for (const auto out_ix : c10::irange(out.numel())) { - CTYPE_OUT any = false; - if (in.numel() > 0) { - std::tuple acc = - map_reduce_over_dim( - [](CTYPE_IN v) { return static_cast(v); }, - [](bool outv, long, bool acc, long) { - return std::tuple{acc || outv, 0}; - }, - in, - dim, - out_ix); - any = std::get<0>(acc); - } - out_data[out_ix] = any; - } + const bool success = parallel_for_each_reduce_over_dim_output_index( + in, dim, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + CTYPE_OUT any = false; + if (in.numel() > 0) { + std::tuple acc = + map_reduce_over_dim( + [](CTYPE_IN v) { return static_cast(v); }, + [](bool outv, long, bool acc, long) { + return std::tuple{acc || outv, 0}; + }, + in, + dim, + out_ix); + any = std::get<0>(acc); + } + out_data[out_ix] = any; + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); }); diff --git a/kernels/portable/cpu/op_argmax.cpp b/kernels/portable/cpu/op_argmax.cpp index a272d4405a8..ffbc469c53d 100644 --- a/kernels/portable/cpu/op_argmax.cpp +++ b/kernels/portable/cpu/op_argmax.cpp @@ -47,23 +47,27 @@ Tensor& argmax_out( ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "argmax.out", CTYPE, [&] { long* out_data = out.mutable_data_ptr(); - for (const auto out_ix : c10::irange(out.numel())) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, 
CTYPE acc_val, long acc_ix) { - // the below condition as written is equivalent to - // !isnan(accval) && (isnan(v) || v > acc_val). See - // argument in op_argmin.cpp. - if (!std::isnan(acc_val) && !(v <= acc_val)) { - acc_val = v; - acc_ix = ix; - } - return std::tuple{acc_val, acc_ix}; - }, - in, - dim, - out_ix); - out_data[out_ix] = std::get<1>(acc); - } + const bool success = parallel_for_each_reduce_over_dim_output_index( + in, dim, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + std::tuple acc = reduce_over_dim( + [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + // the below condition as written is equivalent to + // !isnan(accval) && (isnan(v) || v > acc_val). See + // argument in op_argmin.cpp. + if (!std::isnan(acc_val) && !(v <= acc_val)) { + acc_val = v; + acc_ix = ix; + } + return std::tuple{acc_val, acc_ix}; + }, + in, + dim, + out_ix); + out_data[out_ix] = std::get<1>(acc); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return out; diff --git a/kernels/portable/cpu/op_max.cpp b/kernels/portable/cpu/op_max.cpp index f206ee05b99..3f4a1d27c0e 100644 --- a/kernels/portable/cpu/op_max.cpp +++ b/kernels/portable/cpu/op_max.cpp @@ -83,21 +83,26 @@ std::tuple max_out( CTYPE* max_data = max.mutable_data_ptr(); long* max_indices_data = max_indices.mutable_data_ptr(); - for (const auto out_ix : c10::irange(max.numel())) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { - if (!std::isnan(acc_val) && (std::isnan(v) || v > acc_val)) { - acc_val = v; - acc_ix = ix; - } - return std::tuple{acc_val, acc_ix}; - }, - in, - dim, - out_ix); - max_data[out_ix] = std::get<0>(acc); - max_indices_data[out_ix] = std::get<1>(acc); - } + const bool success = parallel_for_each_reduce_over_dim_output_index( + in, dim, max, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + std::tuple acc = 
reduce_over_dim( + [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + if (!std::isnan(acc_val) && + (std::isnan(v) || v > acc_val)) { + acc_val = v; + acc_ix = ix; + } + return std::tuple{acc_val, acc_ix}; + }, + in, + dim, + out_ix); + max_data[out_ix] = std::get<0>(acc); + max_indices_data[out_ix] = std::get<1>(acc); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return {max, max_indices}; diff --git a/kernels/portable/cpu/op_min.cpp b/kernels/portable/cpu/op_min.cpp index 683ef751a9d..8b70bcd40f5 100644 --- a/kernels/portable/cpu/op_min.cpp +++ b/kernels/portable/cpu/op_min.cpp @@ -83,21 +83,26 @@ std::tuple min_out( CTYPE* min_data = min.mutable_data_ptr(); long* min_indices_data = min_indices.mutable_data_ptr(); - for (const auto out_ix : c10::irange(min.numel())) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { - if (!std::isnan(acc_val) && (std::isnan(v) || v < acc_val)) { - acc_val = v; - acc_ix = ix; - } - return std::tuple{acc_val, acc_ix}; - }, - in, - dim, - out_ix); - min_data[out_ix] = std::get<0>(acc); - min_indices_data[out_ix] = std::get<1>(acc); - } + const bool success = parallel_for_each_reduce_over_dim_output_index( + in, dim, min, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + std::tuple acc = reduce_over_dim( + [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + if (!std::isnan(acc_val) && + (std::isnan(v) || v < acc_val)) { + acc_val = v; + acc_ix = ix; + } + return std::tuple{acc_val, acc_ix}; + }, + in, + dim, + out_ix); + min_data[out_ix] = std::get<0>(acc); + min_indices_data[out_ix] = std::get<1>(acc); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return {min, min_indices}; diff --git a/kernels/portable/cpu/op_prod.cpp b/kernels/portable/cpu/op_prod.cpp index 27d18ca2570..54580459d7c 100644 --- a/kernels/portable/cpu/op_prod.cpp +++ b/kernels/portable/cpu/op_prod.cpp 
@@ -77,22 +77,26 @@ Tensor& prod_int_out( ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, name, CTYPE_IN, [&] { ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, name, CTYPE_OUT, [&] { CTYPE_OUT* out_data = out.mutable_data_ptr(); - for (const auto out_ix : c10::irange(out.numel())) { - CTYPE_OUT prod = 1; - if (in.numel() > 0) { - std::tuple acc = - map_reduce_over_dim( - [](CTYPE_IN v) { return static_cast(v); }, - [](CTYPE_OUT outv, long, CTYPE_OUT acc, long) { - return std::tuple{acc * outv, 0}; - }, - in, - dim, - out_ix); - prod = std::get<0>(acc); - } - out_data[out_ix] = prod; - } + const bool success = parallel_for_each_reduce_over_dim_output_index( + in, dim, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + CTYPE_OUT prod = 1; + if (in.numel() > 0) { + std::tuple acc = + map_reduce_over_dim( + [](CTYPE_IN v) { return static_cast(v); }, + [](CTYPE_OUT outv, long, CTYPE_OUT acc, long) { + return std::tuple{acc * outv, 0}; + }, + in, + dim, + out_ix); + prod = std::get<0>(acc); + } + out_data[out_ix] = prod; + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); }); From c66f533f421344fc4c38b99dafa6ca7fce594215 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 11 Mar 2025 09:50:04 -0700 Subject: [PATCH 27/38] Update [ghstack-poisoned] --- kernels/portable/cpu/op_argmin.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernels/portable/cpu/op_argmin.cpp b/kernels/portable/cpu/op_argmin.cpp index e2f5cdab163..87e90de4c04 100644 --- a/kernels/portable/cpu/op_argmin.cpp +++ b/kernels/portable/cpu/op_argmin.cpp @@ -53,8 +53,7 @@ Tensor& argmin_out( // that dimension is contiguous. Is there any particular reason we // shouldn't just always use this strategy since we aren't // otherwise capable of parallelizing reductions? - const auto reduction_size = - dim.has_value() ? 
in.sizes().at(dim.value()) : in.numel(); + const int64_t reduction_size = get_reduced_dim_product(in, dim); const auto grain_size = std::max( static_cast(1), executorch::extension::internal::GRAIN_SIZE / reduction_size); From e6d6ad619871b1d4af86246607502e174a9b375f Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 11 Mar 2025 09:50:04 -0700 Subject: [PATCH 28/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/reduce_util.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/kernels/portable/cpu/util/reduce_util.cpp b/kernels/portable/cpu/util/reduce_util.cpp index 09ba508a31d..259269988ed 100644 --- a/kernels/portable/cpu/util/reduce_util.cpp +++ b/kernels/portable/cpu/util/reduce_util.cpp @@ -85,10 +85,7 @@ size_t get_reduced_dim_product( } size_t dim_product = 1; if (!dim.has_value()) { - for (size_t i = 0; i < static_cast(in.dim()); ++i) { - dim_product *= in.size(i); - } - return dim_product; + return in.numel(); } const size_t d = _normalize_non_neg_d(dim.value(), in.dim()); return in.size(d); @@ -107,10 +104,7 @@ size_t get_reduced_dim_product( size_t dim_product = 1; const size_t in_dim = in.dim(); if (!dim_list.has_value() || dim_list.value().size() == 0) { - for (size_t i = 0; i < static_cast(in.dim()); ++i) { - dim_product *= in.size(i); - } - return dim_product; + return in.numel(); } for (const auto& d : dim_list.value()) { const size_t non_neg_d = _normalize_non_neg_d(d, in_dim); From 5dc8b27a9f276f0e27c40095d5112ebb0fca5aff Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 11 Mar 2025 09:50:11 -0700 Subject: [PATCH 29/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/reduce_util.h | 26 ++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/kernels/portable/cpu/util/reduce_util.h b/kernels/portable/cpu/util/reduce_util.h index 0b4da15dda9..b618ba09563 100644 --- a/kernels/portable/cpu/util/reduce_util.h +++ b/kernels/portable/cpu/util/reduce_util.h @@ -675,10 
+675,30 @@ template optional dim, const Tensor& out, const Func& func) { - const int64_t reduction_size = get_reduced_dim_product(in, dim); + const ssize_t reduction_size = get_reduced_dim_product(in, dim); const auto grain_size = std::max( - static_cast(1), - executorch::extension::internal::GRAIN_SIZE / reduction_size); + static_cast(1), + static_cast(executorch::extension::internal::GRAIN_SIZE) / + reduction_size); + return executorch::extension::parallel_for(0, out.numel(), grain_size, func); +} + +/** + * parallel_for wrapper for reductions that call reduce_over_dim_list or + * map_reduce_over_dim_list for each output element. Automatically + * calculates appropriate grain size. + */ +template +[[nodiscard]] bool parallel_for_each_reduce_over_dim_list_output_index( + const Tensor& in, + optional> dim_list, + const Tensor& out, + const Func& func) { + const ssize_t reduction_size = get_reduced_dim_product(in, dim_list); + const auto grain_size = std::max( + static_cast(1), + static_cast(executorch::extension::internal::GRAIN_SIZE) / + reduction_size); return executorch::extension::parallel_for(0, out.numel(), grain_size, func); } From 2dcb6dbb7bd923d9d542ad99ef43610cd944120a Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 11 Mar 2025 09:50:16 -0700 Subject: [PATCH 30/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/reduce_util.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernels/portable/cpu/util/reduce_util.h b/kernels/portable/cpu/util/reduce_util.h index b618ba09563..b7c7efa50fe 100644 --- a/kernels/portable/cpu/util/reduce_util.h +++ b/kernels/portable/cpu/util/reduce_util.h @@ -675,11 +675,15 @@ template optional dim, const Tensor& out, const Func& func) { +#ifdef ET_USE_THREADPOOL const ssize_t reduction_size = get_reduced_dim_product(in, dim); const auto grain_size = std::max( static_cast(1), static_cast(executorch::extension::internal::GRAIN_SIZE) / reduction_size); +#else // ET_USE_THREADPOOL + const auto grain_size = 
1;
+#endif // ET_USE_THREADPOOL
 return executorch::extension::parallel_for(0, out.numel(), grain_size, func); } @@ -694,11 +698,15 @@ template optional> dim_list, const Tensor& out, const Func& func) { +#ifdef ET_USE_THREADPOOL const ssize_t reduction_size = get_reduced_dim_product(in, dim_list); const auto grain_size = std::max( static_cast(1), static_cast(executorch::extension::internal::GRAIN_SIZE) / reduction_size); +#else // ET_USE_THREADPOOL + const auto grain_size = 1; +#endif // ET_USE_THREADPOOL return executorch::extension::parallel_for(0, out.numel(), grain_size, func); } From 01f27904f4bb75cb94b115f5bc4191ba196cb926 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 12 Mar 2025 12:19:34 -0700 Subject: [PATCH 31/38] Update [ghstack-poisoned] --- kernels/portable/cpu/op_amax.cpp | 18 ++++++++------ kernels/portable/cpu/op_amin.cpp | 18 ++++++++------ kernels/portable/cpu/op_any.cpp | 25 +++++++++++-------- kernels/portable/cpu/op_mean.cpp | 35 +++++++++++++++------------ kernels/portable/cpu/op_sum.cpp | 36 +++++++++++++++------------- kernels/portable/cpu/op_var.cpp | 41 ++++++++++++++++++-------------- 6 files changed, 100 insertions(+), 73 deletions(-) diff --git a/kernels/portable/cpu/op_amax.cpp b/kernels/portable/cpu/op_amax.cpp index 6030221d883..4ad409d4820 100644 --- a/kernels/portable/cpu/op_amax.cpp +++ b/kernels/portable/cpu/op_amax.cpp @@ -46,13 +46,17 @@ Tensor& amax_out( ReduceOverDimListPlan plan(in, dim_list); ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "amax.out", CTYPE, [&]() { CTYPE* out_data = out.mutable_data_ptr(); - for (const auto out_ix : c10::irange(out.numel())) { - out_data[out_ix] = plan.execute( - [](CTYPE v, CTYPE max_v) { - return std::isnan(v) || v > max_v ? 
v : max_v; - }, - out_ix); - } + const bool success = parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + out_data[out_ix] = plan.execute( + [](CTYPE v, CTYPE max_v) { + return std::isnan(v) || v > max_v ? v : max_v; + }, + out_ix); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return out; diff --git a/kernels/portable/cpu/op_amin.cpp b/kernels/portable/cpu/op_amin.cpp index e4979390a5d..396cb6c016d 100644 --- a/kernels/portable/cpu/op_amin.cpp +++ b/kernels/portable/cpu/op_amin.cpp @@ -45,13 +45,17 @@ Tensor& amin_out( ReduceOverDimListPlan plan(in, dim_list); ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "amin.out", CTYPE, [&]() { CTYPE* out_data = out.mutable_data_ptr(); - for (const auto out_ix : c10::irange(out.numel())) { - out_data[out_ix] = plan.execute( - [](CTYPE v, CTYPE min_v) { - return std::isnan(v) || v < min_v ? v : min_v; - }, - out_ix); - } + const bool success = parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + out_data[out_ix] = plan.execute( + [](CTYPE v, CTYPE min_v) { + return std::isnan(v) || v < min_v ? 
v : min_v; + }, + out_ix); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return out; diff --git a/kernels/portable/cpu/op_any.cpp b/kernels/portable/cpu/op_any.cpp index a368226db80..ee9e54fc0c3 100644 --- a/kernels/portable/cpu/op_any.cpp +++ b/kernels/portable/cpu/op_any.cpp @@ -96,16 +96,21 @@ Tensor& any_dims_out( static_cast(static_cast(in_data[out_ix])); } } else { - for (const auto out_ix : c10::irange(out.numel())) { - bool any = false; - if (in_not_empty) { - any = plan->execute( - [](CTYPE_IN v) { return static_cast(v); }, - [](bool outv, bool acc) { return acc || outv; }, - out_ix); - } - out_data[out_ix] = static_cast(any); - } + const bool success = + parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + bool any = false; + if (in_not_empty) { + any = plan->execute( + [](CTYPE_IN v) { return static_cast(v); }, + [](bool outv, bool acc) { return acc || outv; }, + out_ix); + } + out_data[out_ix] = static_cast(any); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); } }); }); diff --git a/kernels/portable/cpu/op_mean.cpp b/kernels/portable/cpu/op_mean.cpp index c13e2a09937..423c2564232 100644 --- a/kernels/portable/cpu/op_mean.cpp +++ b/kernels/portable/cpu/op_mean.cpp @@ -46,22 +46,27 @@ Tensor& mean_dim_out( out); MapReduceOverDimListPlan plan(in, dim_list); - ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "mean.out", CTYPE_IN, [&] { - ET_SWITCH_FLOATHBF16_TYPES( - out.scalar_type(), ctx, "mean.out", CTYPE_OUT, [&] { - CTYPE_OUT* out_data = out.mutable_data_ptr(); - const size_t num = get_reduced_dim_product(in, dim_list); - for (const auto out_ix : c10::irange(out.numel())) { - CTYPE_OUT sum = 0; - if (in.numel() > 0) { - sum = plan.execute( - [](CTYPE_IN v) { return static_cast(v); }, - [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, - out_ix); + // 
@lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "mean.out"; + ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] { + ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] { + CTYPE_OUT* out_data = out.mutable_data_ptr(); + const size_t num = get_reduced_dim_product(in, dim_list); + const bool success = parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + CTYPE_OUT sum = 0; + if (in.numel() > 0) { + sum = plan.execute( + [](CTYPE_IN v) { return static_cast(v); }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + out_ix); + } + out_data[out_ix] = sum / static_cast(num); } - out_data[out_ix] = sum / static_cast(num); - } - }); + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); + }); }); return out; diff --git a/kernels/portable/cpu/op_sum.cpp b/kernels/portable/cpu/op_sum.cpp index f58773a6769..550f6b9572f 100644 --- a/kernels/portable/cpu/op_sum.cpp +++ b/kernels/portable/cpu/op_sum.cpp @@ -50,23 +50,27 @@ Tensor& sum_dim_out( if (in.numel() > 0) { plan.emplace(in, dim_list); } - ET_SWITCH_REALHBBF16_TYPES( - in.scalar_type(), ctx, "sum.IntList_out", CTYPE_IN, [&] { - ET_SWITCH_REALHBBF16_TYPES( - out.scalar_type(), ctx, "sum.IntList_out", CTYPE_OUT, [&] { - CTYPE_OUT* out_data = out.mutable_data_ptr(); - for (const auto out_ix : c10::irange(out.numel())) { - CTYPE_OUT sum = 0; - if (plan.has_value()) { - sum = plan->execute( - [](CTYPE_IN v) { return static_cast(v); }, - [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, - out_ix); - } - out_data[out_ix] = sum; + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "sum.IntList_out"; + ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] { + ET_SWITCH_REALHBBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] { 
CTYPE_OUT* out_data = out.mutable_data_ptr(); + const bool success = parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + CTYPE_OUT sum = 0; + if (plan.has_value()) { + sum = plan->execute( + [](CTYPE_IN v) { return static_cast(v); }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + out_ix); } - }); - }); + out_data[out_ix] = sum; + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); + }); + }); return out; } diff --git a/kernels/portable/cpu/op_var.cpp b/kernels/portable/cpu/op_var.cpp index c5be3fdad62..f09f1d92bc9 100644 --- a/kernels/portable/cpu/op_var.cpp +++ b/kernels/portable/cpu/op_var.cpp @@ -21,6 +21,7 @@ namespace { template void compute_variance( + KernelRuntimeContext& ctx, const Tensor& in, Tensor& out, optional> dim_list, @@ -33,22 +34,26 @@ void compute_variance( } } else { MapReduceOverDimListPlan plan(in, dim_list); - for (const auto out_ix : c10::irange(out.numel())) { - CTYPE_OUT sum = plan.execute( - [](CTYPE_IN v) { return static_cast(v); }, - [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, - out_ix); - CTYPE_OUT mean = sum / static_cast(num); - CTYPE_OUT sum2 = plan.execute( - [mean](CTYPE_IN v) { - return ( - (static_cast(v) - mean) * - (static_cast(v) - mean)); - }, - [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, - out_ix); - out_data[out_ix] = sum2 / denominator; - } + const bool success = parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + CTYPE_OUT sum = plan.execute( + [](CTYPE_IN v) { return static_cast(v); }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + out_ix); + CTYPE_OUT mean = sum / static_cast(num); + CTYPE_OUT sum2 = plan.execute( + [mean](CTYPE_IN v) { + return ( + (static_cast(v) - mean) * + (static_cast(v) - mean)); 
+ }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + out_ix); + out_data[out_ix] = sum2 / denominator; + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); } } @@ -90,7 +95,7 @@ Tensor& var_out( ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] { ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] { - compute_variance(in, out, dim_list, num, denom); + compute_variance(ctx, in, out, dim_list, num, denom); }); }); @@ -135,7 +140,7 @@ Tensor& var_correction_out( ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] { ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] { - compute_variance(in, out, dim_list, num, denom); + compute_variance(ctx, in, out, dim_list, num, denom); }); }); From ccfbacf7671083bd79538920e525a222b5c6d9bb Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 13 Mar 2025 14:13:28 -0700 Subject: [PATCH 32/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/elementwise_util.h | 61 +++++++++++++------- kernels/portable/cpu/util/targets.bzl | 1 + 2 files changed, 41 insertions(+), 21 deletions(-) diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index a5bcd6ff98b..02d9a909c17 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -100,28 +101,46 @@ inline void apply_elementwise_fn( const auto out_element_size = out.element_size(); if (any_is_broadcasted) { - for (const auto& indexes : - BroadcastIndexesRange(out, (*inputs.first)...)) { - std::array loaded_inputs; - for (const auto idx : c10::irange(kNumInputs)) { - const auto& input_info = inputs_info[idx]; - loaded_inputs[idx] = input_info.load_to_common( - &input_info.data_ptr[indexes[idx + 1] * input_info.element_size]); - } - auto result = std::apply(compute_fun, loaded_inputs); - 
store_common_to_out(result, &data_out[indexes[0] * out_element_size]); - } + ::executorch::extension::parallel_for( + 0, + out.numel(), + ::executorch::extension::internal::GRAIN_SIZE, + [&](const auto begin, const auto end) { + const auto range = + BroadcastIndexesRange(out, (*inputs.first)...); + auto begin_it = range.begin(); + begin_it += begin; + for (; (*begin_it)[0] < end; ++begin_it) { + const auto& indexes = *begin_it; + std::array loaded_inputs; + for (const auto idx : c10::irange(kNumInputs)) { + const auto& input_info = inputs_info[idx]; + loaded_inputs[idx] = input_info.load_to_common( + &input_info + .data_ptr[indexes[idx + 1] * input_info.element_size]); + } + auto result = std::apply(compute_fun, loaded_inputs); + store_common_to_out( + result, &data_out[indexes[0] * out_element_size]); + } + }); } else { - for (const auto i : c10::irange(out.numel())) { - std::array loaded_inputs; - for (const auto idx : c10::irange(kNumInputs)) { - const auto& input_info = inputs_info[idx]; - loaded_inputs[idx] = input_info.load_to_common( - &input_info.data_ptr[i * input_info.element_size]); - } - auto result = std::apply(compute_fun, loaded_inputs); - store_common_to_out(result, &data_out[i * out_element_size]); - } + ::executorch::extension::parallel_for( + 0, + out.numel(), + ::executorch::extension::internal::GRAIN_SIZE, + [&](const auto begin, const auto end) { + for (const auto i : c10::irange(begin, end)) { + std::array loaded_inputs; + for (const auto idx : c10::irange(kNumInputs)) { + const auto& input_info = inputs_info[idx]; + loaded_inputs[idx] = input_info.load_to_common( + &input_info.data_ptr[i * input_info.element_size]); + } + auto result = std::apply(compute_fun, loaded_inputs); + store_common_to_out(result, &data_out[i * out_element_size]); + } + }); } } } // namespace internal diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index bf2fe042a93..bffea2140b5 100644 --- 
a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -110,6 +110,7 @@ def define_common_targets(): ":broadcast_util", ":dtype_util", "//executorch/runtime/kernel:kernel_runtime_context", + "//executorch/runtime/kernel:thread_parallel_interface", ], deps = [ "//executorch/kernels/portable/cpu:scalar_utils", From 21958dc7010b9d07fcbca9a1f18a7ca680fd6e89 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 14 Mar 2025 15:49:28 -0700 Subject: [PATCH 33/38] Update [ghstack-poisoned] --- kernels/optimized/cpu/op_where.cpp | 46 ++++++------------- .../cpu/util/broadcast_indexes_range.h | 26 +++++++---- kernels/portable/cpu/util/broadcast_util.h | 39 +++------------- kernels/portable/cpu/util/elementwise_util.h | 36 ++++----------- 4 files changed, 47 insertions(+), 100 deletions(-) diff --git a/kernels/optimized/cpu/op_where.cpp b/kernels/optimized/cpu/op_where.cpp index 7d58ba4852c..fb14e542891 100644 --- a/kernels/optimized/cpu/op_where.cpp +++ b/kernels/optimized/cpu/op_where.cpp @@ -48,42 +48,24 @@ Tensor& opt_where_out( cond.scalar_type() == ScalarType::Bool) { auto out_numel = out.numel(); ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool cond_is_broadcasted = !out.sizes().equals(cond.sizes()); - const bool any_is_broadcasted = - (a_is_broadcasted || b_is_broadcasted || cond_is_broadcasted); const CTYPE_COMPUTE* const data_a = a.const_data_ptr(); const CTYPE_COMPUTE* const data_b = b.const_data_ptr(); const bool* const data_cond = cond.const_data_ptr(); CTYPE_COMPUTE* const data_out = out.data_ptr(); - if (any_is_broadcasted) { - executorch::extension::parallel_for( - 0, - out_numel, - ::executorch::extension::internal::GRAIN_SIZE, - [&](const auto begin, const auto end) { - auto range = BroadcastIndexesRange<3>(out, a, b, cond); - auto begin_it = range.begin(); - 
begin_it += begin; - for (; (*begin_it)[0] < end; ++begin_it) { - const auto [out_index, a_index, b_index, cond_index] = - *begin_it; - data_out[out_index] = - data_cond[cond_index] ? data_a[a_index] : data_b[b_index]; - } - }); - } else { - executorch::extension::parallel_for( - 0, - out_numel, - ::executorch::extension::internal::GRAIN_SIZE, - [&](const auto begin, const auto end) { - for (const auto i : c10::irange(begin, end)) { - data_out[i] = data_cond[i] ? data_a[i] : data_b[i]; - } - }); - } + executorch::extension::parallel_for( + 0, + out_numel, + ::executorch::extension::internal::GRAIN_SIZE, + [&](const auto begin, const auto end) { + auto range = BroadcastIndexesRange<3>(out, a, b, cond); + auto begin_it = range.begin(); + begin_it += begin; + for (; (*begin_it)[0] < end; ++begin_it) { + const auto [out_index, a_index, b_index, cond_index] = *begin_it; + data_out[out_index] = + data_cond[cond_index] ? data_a[a_index] : data_b[b_index]; + } + }); }); } else { // Fall back for mixed dtype to keep code size and compile time diff --git a/kernels/portable/cpu/util/broadcast_indexes_range.h b/kernels/portable/cpu/util/broadcast_indexes_range.h index 5fa50d8d212..7b78f4c2814 100644 --- a/kernels/portable/cpu/util/broadcast_indexes_range.h +++ b/kernels/portable/cpu/util/broadcast_indexes_range.h @@ -34,14 +34,17 @@ class BroadcastIndexesIterator { template explicit BroadcastIndexesIterator(const Tensor& output, const Args&... args) - : output_dim_(output.dim()), - output_shape_(output.sizes()), - effective_input_broadcast_strides_{ - effective_input_broadcast_stride(output, args)...} { + : output_dim_or_zero_if_no_broadcasting_( + ((args.sizes() == output.sizes()) && ...) ? 
0 : output.dim()), + output_shape_(output.sizes()) { static_assert( sizeof...(args) == kNumInputs && (std::is_same_v && ...), "BroadcastIndexesIterator constructor requires kNumInputs input tensor" "arguments!"); + if (output_dim_or_zero_if_no_broadcasting_ != 0) { + effective_input_broadcast_strides_ = { + effective_input_broadcast_stride(output, args)...}; + } } struct make_end_t { @@ -73,9 +76,14 @@ class BroadcastIndexesIterator { BroadcastIndexesIterator& operator++() { output_index()++; + if (output_dim_or_zero_if_no_broadcasting_ == 0) { + std::fill( + current_indexes_.begin() + 1, current_indexes_.end(), output_index()); + return *this; + } // TODO: add optimization for particular input tensors not being // broadcasted? - for (auto ii = output_dim_ - 1; ii >= 0; --ii) { + for (auto ii = output_dim_or_zero_if_no_broadcasting_ - 1; ii >= 0; --ii) { // You might wonder what happens if output_shape_[ii] == 0. In // that case, output.numel() would be 0, and thus we would have // begin() == end() and no iteration. @@ -121,7 +129,8 @@ class BroadcastIndexesIterator { delinearized_output_index_.size()); for (const auto ii : c10::irange(1, kNumInputs + 1)) { current_indexes_[ii] = 0; - for (const auto jj : c10::irange(output_dim_)) { + for (const auto jj : + c10::irange(output_dim_or_zero_if_no_broadcasting_)) { current_indexes_[ii] += delinearized_output_index_[jj] * effective_input_broadcast_strides_[ii - 1][jj]; } @@ -180,7 +189,7 @@ class BroadcastIndexesIterator { // followed by kNumInputs input indexes. std::array current_indexes_ = {0}; ShapeType delinearized_output_index_ = {0}; - ssize_t output_dim_; + ssize_t output_dim_or_zero_if_no_broadcasting_; ArrayRef output_shape_; // The linear index for a broadcast tensor is // sum(delinearized_output_index_[i] * input_stride_[i] if @@ -189,8 +198,7 @@ class BroadcastIndexesIterator { // output_dim. 
This is straightforwardly implementable with an // adjusted stride array that contains 0s where the padded input // shape would contain 1s. - std::array effective_input_broadcast_strides_ = { - {{0}}}; + std::array effective_input_broadcast_strides_; }; } // namespace internal diff --git a/kernels/portable/cpu/util/broadcast_util.h b/kernels/portable/cpu/util/broadcast_util.h index ed536f86c2d..2b10ee24411 100644 --- a/kernels/portable/cpu/util/broadcast_util.h +++ b/kernels/portable/cpu/util/broadcast_util.h @@ -254,26 +254,13 @@ inline void apply_binary_elementwise_fn( const Tensor& a, const Tensor& b, const Tensor& out) { - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool any_is_broadcasted = (a_is_broadcasted || b_is_broadcasted); - const CTYPE_A* const data_a = a.const_data_ptr(); const CTYPE_B* const data_b = b.const_data_ptr(); CTYPE_OUT* const data_out = out.mutable_data_ptr(); - if (any_is_broadcasted) { - for (const auto [out_index, a_index, b_index] : - BroadcastIndexesRange<2>(out, a, b)) { - data_out[out_index] = compute_fun(data_a[a_index], data_b[b_index]); - } - } else { - for (const auto i : c10::irange(out.numel())) { - size_t a_linear_index = i; - size_t b_linear_index = i; - - data_out[i] = compute_fun(data_a[a_linear_index], data_b[b_linear_index]); - } + for (const auto [out_index, a_index, b_index] : + BroadcastIndexesRange<2>(out, a, b)) { + data_out[out_index] = compute_fun(data_a[a_index], data_b[b_index]); } } @@ -294,27 +281,15 @@ inline void apply_ternary_elementwise_fn( const Tensor& b, const Tensor& c, const Tensor& out) { - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool c_is_broadcasted = !out.sizes().equals(c.sizes()); - const bool any_is_broadcasted = - (a_is_broadcasted || b_is_broadcasted || c_is_broadcasted); - const CTYPE_A* const data_a = 
a.const_data_ptr(); const CTYPE_B* const data_b = b.const_data_ptr(); const CTYPE_C* const data_c = c.const_data_ptr(); CTYPE_OUT* const data_out = out.mutable_data_ptr(); - if (any_is_broadcasted) { - for (const auto [out_index, a_index, b_index, c_index] : - BroadcastIndexesRange<3>(out, a, b, c)) { - data_out[out_index] = - compute_fun(data_a[a_index], data_b[b_index], data_c[c_index]); - } - } else { - for (const auto i : c10::irange(out.numel())) { - data_out[i] = compute_fun(data_a[i], data_b[i], data_c[i]); - } + for (const auto [out_index, a_index, b_index, c_index] : + BroadcastIndexesRange<3>(out, a, b, c)) { + data_out[out_index] = + compute_fun(data_a[a_index], data_b[b_index], data_c[c_index]); } } diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index a5bcd6ff98b..23ec481bb7f 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -76,11 +76,6 @@ inline void apply_elementwise_fn( internal::check_tensor_dtype(out, out_dtypes, compute_type), InvalidArgument, ); - bool any_is_broadcasted = false; - if constexpr (kNumInputs > 1) { - any_is_broadcasted = (!out.sizes().equals(inputs.first->sizes()) || ...); - } - struct InputInfo { load_to_common_fn load_to_common; const char* data_ptr; @@ -99,29 +94,16 @@ inline void apply_elementwise_fn( char* const data_out = reinterpret_cast(out.mutable_data_ptr()); const auto out_element_size = out.element_size(); - if (any_is_broadcasted) { - for (const auto& indexes : - BroadcastIndexesRange(out, (*inputs.first)...)) { - std::array loaded_inputs; - for (const auto idx : c10::irange(kNumInputs)) { - const auto& input_info = inputs_info[idx]; - loaded_inputs[idx] = input_info.load_to_common( - &input_info.data_ptr[indexes[idx + 1] * input_info.element_size]); - } - auto result = std::apply(compute_fun, loaded_inputs); - store_common_to_out(result, &data_out[indexes[0] * out_element_size]); - } - } else { - 
for (const auto i : c10::irange(out.numel())) { - std::array loaded_inputs; - for (const auto idx : c10::irange(kNumInputs)) { - const auto& input_info = inputs_info[idx]; - loaded_inputs[idx] = input_info.load_to_common( - &input_info.data_ptr[i * input_info.element_size]); - } - auto result = std::apply(compute_fun, loaded_inputs); - store_common_to_out(result, &data_out[i * out_element_size]); + for (const auto& indexes : + BroadcastIndexesRange(out, (*inputs.first)...)) { + std::array loaded_inputs; + for (const auto idx : c10::irange(kNumInputs)) { + const auto& input_info = inputs_info[idx]; + loaded_inputs[idx] = input_info.load_to_common( + &input_info.data_ptr[indexes[idx + 1] * input_info.element_size]); } + auto result = std::apply(compute_fun, loaded_inputs); + store_common_to_out(result, &data_out[indexes[0] * out_element_size]); } } } // namespace internal From 5cb625abcaf55c2145021cd098d41b45bb1ca184 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 17 Mar 2025 16:03:04 -0700 Subject: [PATCH 34/38] Update [ghstack-poisoned] --- .../executor_runner/executor_runner.cpp | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp index 1e0241958b9..ad8c159a7be 100644 --- a/examples/portable/executor_runner/executor_runner.cpp +++ b/examples/portable/executor_runner/executor_runner.cpp @@ -249,18 +249,24 @@ int main(int argc, char** argv) { (uint32_t)method.error()); ET_LOG(Info, "Method loaded."); - // Allocate input tensors and set all of their elements to 1. The `inputs` - // variable owns the allocated memory and must live past the last call to - // `execute()`. - auto inputs = executorch::extension::prepare_input_tensors(*method); - ET_CHECK_MSG( - inputs.ok(), - "Could not prepare inputs: 0x%" PRIx32, - (uint32_t)inputs.error()); - ET_LOG(Info, "Inputs prepared."); - // Run the model. 
for (uint32_t i = 0; i < FLAGS_num_executions; i++) { + ET_LOG(Info, "Preparing inputs."); + // Allocate input tensors and set all of their elements to 1. The `inputs` + // variable owns the allocated memory and must live past the last call to + // `execute()`. + // + // NOTE: we have to re-prepare input tensors on every execution + // because inputs whose space gets reused by memory planning (if + // any such inputs exist) will not be preserved for the next + // execution. + auto inputs = executorch::extension::prepare_input_tensors(*method); + ET_CHECK_MSG( + inputs.ok(), + "Could not prepare inputs: 0x%" PRIx32, + (uint32_t)inputs.error()); + ET_LOG(Info, "Inputs prepared."); + Error status = method->execute(); ET_CHECK_MSG( status == Error::Ok, From ac789abf1d728bbc39690642a51904aa169b1269 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 17 Mar 2025 16:17:13 -0700 Subject: [PATCH 35/38] Update [ghstack-poisoned] --- examples/portable/executor_runner/executor_runner.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp index ad8c159a7be..08907d333c4 100644 --- a/examples/portable/executor_runner/executor_runner.cpp +++ b/examples/portable/executor_runner/executor_runner.cpp @@ -251,7 +251,7 @@ int main(int argc, char** argv) { // Run the model. for (uint32_t i = 0; i < FLAGS_num_executions; i++) { - ET_LOG(Info, "Preparing inputs."); + ET_LOG(Debug, "Preparing inputs."); // Allocate input tensors and set all of their elements to 1. The `inputs` // variable owns the allocated memory and must live past the last call to // `execute()`. 
@@ -265,7 +265,7 @@ int main(int argc, char** argv) { inputs.ok(), "Could not prepare inputs: 0x%" PRIx32, (uint32_t)inputs.error()); - ET_LOG(Info, "Inputs prepared."); + ET_LOG(Debug, "Inputs prepared."); Error status = method->execute(); ET_CHECK_MSG( From a9a9a1e2e096246d193fe684ab05664b4a5713b0 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 17 Mar 2025 16:17:13 -0700 Subject: [PATCH 36/38] Update [ghstack-poisoned] --- .../portable/executor_runner/executor_runner.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp index 08907d333c4..7c75c39f0a9 100644 --- a/examples/portable/executor_runner/executor_runner.cpp +++ b/examples/portable/executor_runner/executor_runner.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #ifdef ET_EVENT_TRACER_ENABLED #include @@ -249,6 +250,7 @@ int main(int argc, char** argv) { (uint32_t)method.error()); ET_LOG(Info, "Method loaded."); + et_timestamp_t time_spent_executing = 0; // Run the model. 
for (uint32_t i = 0; i < FLAGS_num_executions; i++) { ET_LOG(Debug, "Preparing inputs."); @@ -267,17 +269,24 @@ int main(int argc, char** argv) { (uint32_t)inputs.error()); ET_LOG(Debug, "Inputs prepared."); + const et_timestamp_t before_execute = et_pal_current_ticks(); Error status = method->execute(); + const et_timestamp_t after_execute = et_pal_current_ticks(); + time_spent_executing += after_execute - before_execute; ET_CHECK_MSG( status == Error::Ok, "Execution of method %s failed with status 0x%" PRIx32, method_name, (uint32_t)status); } + const auto tick_ratio = et_pal_ticks_to_ns_multiplier(); + constexpr auto NANOSECONDS_PER_MILLISECOND = 1000000; ET_LOG( Info, - "Model executed successfully %" PRIu32 " time(s).", - FLAGS_num_executions); + "Model executed successfully %" PRIu32 " time(s) in %f ms.", + FLAGS_num_executions, + static_cast(time_spent_executing) * tick_ratio.numerator / + tick_ratio.denominator / NANOSECONDS_PER_MILLISECOND); // Print the outputs. std::vector outputs(method->outputs_size()); From 7369bc2f777b615413abb08b00eab806bfe16993 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 17 Mar 2025 18:55:52 -0700 Subject: [PATCH 37/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/functional_util.h | 13 ++++++++++--- kernels/portable/cpu/util/targets.bzl | 3 +++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/kernels/portable/cpu/util/functional_util.h b/kernels/portable/cpu/util/functional_util.h index 609a1a26fa5..d7ea201dbd2 100644 --- a/kernels/portable/cpu/util/functional_util.h +++ b/kernels/portable/cpu/util/functional_util.h @@ -12,6 +12,7 @@ #include #include +#include namespace torch { namespace executor { @@ -53,9 +54,15 @@ inline void apply_unary_map_fn( CTYPE_OUT* const data_out, const int64_t size, const int64_t stride = 1) { - for (const auto i : c10::irange(size)) { - data_out[i * stride] = map_fun(data_in[i * stride]); - } + executorch::extension::parallel_for( + 0, + size, + 
::executorch::extension::internal::GRAIN_SIZE, + [&](const auto begin, const auto end) { + for (const auto i : c10::irange(begin, end)) { + data_out[i * stride] = map_fun(data_in[i * stride]); + } + }); } // diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index d3274bb3c96..a623b9d4d7a 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -244,6 +244,9 @@ def define_common_targets(): name = "functional_util", srcs = [], exported_headers = ["functional_util.h"], + exported_deps = [ + "//executorch/runtime/kernel:thread_parallel_interface", + ], deps = [ "//executorch/runtime/kernel:kernel_includes", "//executorch/runtime/core/exec_aten/util:tensor_util", From a865349295c476578741b954209bff0899b85ecd Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 18 Mar 2025 13:49:12 -0700 Subject: [PATCH 38/38] Update [ghstack-poisoned] --- kernels/portable/cpu/util/broadcast_indexes_range.h | 5 +++++ .../portable/cpu/util/test/broadcast_indexes_range_test.cpp | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/kernels/portable/cpu/util/broadcast_indexes_range.h b/kernels/portable/cpu/util/broadcast_indexes_range.h index 7b78f4c2814..aaf7207d0c9 100644 --- a/kernels/portable/cpu/util/broadcast_indexes_range.h +++ b/kernels/portable/cpu/util/broadcast_indexes_range.h @@ -122,6 +122,11 @@ class BroadcastIndexesIterator { } output_index() += n; + if (output_dim_or_zero_if_no_broadcasting_ == 0) { + std::fill( + current_indexes_.begin() + 1, current_indexes_.end(), output_index()); + return *this; + } delinearize_index( output_index(), output_shape_, diff --git a/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp b/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp index 519cd9fe9f9..1023915ea66 100644 --- a/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp +++ b/kernels/portable/cpu/util/test/broadcast_indexes_range_test.cpp @@ 
-44,7 +44,9 @@ TEST(BroadcastIndexesRangeTest, OneDNotBroadcasted) { Tensor out = tf.zeros({5}); int idx = 0; - for (const auto& elem : range_to_vec(BroadcastIndexesRange<1>(out, out))) { + const auto range = BroadcastIndexesRange<1>(out, out); + for (const auto& elem : range_to_vec(range)) { + EXPECT_EQ(*(range.begin() + idx), elem); EXPECT_EQ(elem[0], idx++); EXPECT_EQ(elem[0], elem[1]); } @@ -71,7 +73,7 @@ TEST(BroadcastIndexesRangeTest, ScalarBroadcastToOneD) { template void test_operator_plus(const Range& range) { size_t idx = 0; - for (const auto indexes : range) { + for (const auto& indexes : range) { EXPECT_EQ(*(range.begin() + idx), indexes); idx++; }