diff --git a/.lintrunner.toml b/.lintrunner.toml index 7667ac430d1..1a27228d266 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -218,6 +218,8 @@ exclude_patterns = [ 'examples/**', 'extension/**', 'kernels/optimized/**', + # Justified include. + 'runtime/kernel/thread_parallel_interface.h', 'scripts/**', 'third-party/**', 'util/**', diff --git a/CMakeLists.txt b/CMakeLists.txt index 73b89b6171e..fabf667cbe1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -751,7 +751,6 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO ) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/parallel) endif() if(EXECUTORCH_BUILD_PYBIND) diff --git a/Test.cmake b/Test.cmake index d4b5f6aa1db..6bd7a86e70b 100644 --- a/Test.cmake +++ b/Test.cmake @@ -13,7 +13,6 @@ if(BUILD_TESTING) add_subdirectory(extension/evalue_util/test) add_subdirectory(extension/kernel_util/test) add_subdirectory(extension/memory_allocator/test) - add_subdirectory(extension/parallel/test) add_subdirectory(extension/pytree/test) add_subdirectory(kernels/portable/cpu/util/test) add_subdirectory(kernels/prim_ops/test) diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index b1ed81b6a7e..bbc9eec3a0e 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -88,7 +88,6 @@ excludes = [ deps = [ "executorch", "executorch_core", - "extension_parallel", "extension_threadpool", "portable_kernels", ] @@ -131,7 +130,7 @@ excludes = [ deps = [ "executorch_core", "executorch", - "extension_parallel", + "extension_threadpool", ] [targets.optimized_native_cpu_ops] @@ -146,7 +145,6 @@ excludes = [ deps = [ "executorch_core", "executorch", - "extension_parallel", "extension_threadpool", "portable_kernels", ] @@ -227,19 +225,6 @@ deps = [ "extension_runner_util", ] -[targets.extension_parallel] -buck_targets = [ - "//extension/parallel:thread_parallel", -] -filters = [ - ".cpp$", -] -deps = [ - "executorch", - "executorch_core", - "extension_threadpool", -] - [targets.extension_tensor] buck_targets = [ "//extension/tensor:tensor", @@ -379,6 +364,7 @@ excludes = [ deps = [ "executorch", "executorch_core", + "extension_threadpool", "xnnpack_backend", "portable_kernels", ] @@ -393,6 +379,7 @@ filters = [ deps = [ "executorch", "executorch_core", + "extension_threadpool", ] [targets.xnnpack_schema] @@ -427,7 +414,6 @@ deps = [ "executorch", "executorch_core", "optimized_kernels", - "extension_parallel", "extension_threadpool", "reduce_util", "xnnpack_backend", @@ -465,7 +451,7 @@ deps = [ "executorch_core", "extension_data_loader", "extension_module", - "extension_parallel", + "extension_threadpool", "portable_kernels", "quantized_kernels", "xnnpack_backend", diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 9d429490d58..931d31de8ef 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -75,7 +75,6 @@ set(lib_list custom_ops extension_module extension_module_static - extension_parallel extension_runner_util extension_tensor extension_threadpool @@ -131,14 +130,9 @@ endforeach() # TODO: investigate use of install(EXPORT) to cleanly handle # target_compile_options/target_compile_definitions for everything. 
-if(TARGET extension_parallel) - set_target_properties( - extension_parallel PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool - ) -endif() if(TARGET cpublas) set_target_properties( - cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_parallel + cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool ) endif() if(TARGET extension_threadpool) diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index f0a7775e803..371fcf38a24 100644 --- a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -19,8 +19,8 @@ #include #ifdef ET_USE_THREADPOOL -#include #include +#include #endif #include diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl index e3e8b30520f..1c4686fe3d0 100644 --- a/extension/llm/custom_ops/targets.bzl +++ b/extension/llm/custom_ops/targets.bzl @@ -37,7 +37,6 @@ def define_common_targets(): "//executorch/kernels/optimized:libblas{}".format(mkl_dep), "//executorch/kernels/optimized:libvec", "//executorch/extension/kernel_util:kernel_util", - "//executorch/extension/parallel:thread_parallel", "//executorch/extension/threadpool:threadpool", ], deps = [ diff --git a/extension/parallel/CMakeLists.txt b/extension/parallel/CMakeLists.txt deleted file mode 100644 index 7f727aafe46..00000000000 --- a/extension/parallel/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# Please keep this file formatted by running: -# ~~~ -# cmake-format -i CMakeLists.txt -# ~~~ - -if(NOT (EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO)) - message(FATAL_ERROR "extension/parallel requires extension/threadpool") -endif() - -add_library(extension_parallel thread_parallel.cpp) - -target_link_libraries(extension_parallel PUBLIC executorch_core extension_threadpool) -target_compile_options(extension_parallel PUBLIC ${_common_compile_options}) - -install( - TARGETS extension_parallel - DESTINATION lib - INCLUDES - DESTINATION ${_common_include_directories}) diff --git a/extension/parallel/TARGETS b/extension/parallel/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/parallel/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/parallel/targets.bzl b/extension/parallel/targets.bzl deleted file mode 100644 index 82a8502c034..00000000000 --- a/extension/parallel/targets.bzl +++ /dev/null @@ -1,26 +0,0 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") - -def define_common_targets(): - """Defines targets that should be shared between fbcode and xplat. - - The directory containing this targets.bzl file should also contain both - TARGETS and BUCK files that call this function. 
- """ - - runtime.cxx_library( - name = "thread_parallel", - srcs = [ - "thread_parallel.cpp", - ], - exported_headers = [ - "thread_parallel.h", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//executorch/extension/threadpool:threadpool", - "//executorch/runtime/core:core", - ], - ) diff --git a/extension/parallel/test/TARGETS b/extension/parallel/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/parallel/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/parallel/test/targets.bzl b/extension/parallel/test/targets.bzl deleted file mode 100644 index 791c0727471..00000000000 --- a/extension/parallel/test/targets.bzl +++ /dev/null @@ -1,19 +0,0 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -def define_common_targets(): - """Defines targets that should be shared between fbcode and xplat. - - The directory containing this targets.bzl file should also contain both - TARGETS and BUCK files that call this function. - """ - - runtime.cxx_test( - name = "thread_parallel_test", - srcs = [ - "thread_parallel_test.cpp", - ], - deps = [ - "//executorch/extension/parallel:thread_parallel", - "//executorch/runtime/platform:platform", - ], - ) diff --git a/extension/parallel/thread_parallel.h b/extension/parallel/thread_parallel.h index 8b174075ae9..8bd1a572cd7 100644 --- a/extension/parallel/thread_parallel.h +++ b/extension/parallel/thread_parallel.h @@ -8,46 +8,7 @@ #pragma once -#include -#include - -namespace executorch { -namespace extension { - -/** - * A helper to run function in parallel. - * - * begin, end: describe the extent of the workitems via first and last workitem - * to be processed - * grain_size: number of workitems processed by user callback which is - * described below - * f: user function applied in parallel to the chunks, signature: - * void f(int64_t begin, int64_t end) - * Returns true if all work items are processed successfully, false otherwise - * - * Warning: parallel_for does NOT copy thread local states from the current - * thread to the worker threads. Users need to protect the access to captured - * data if they mutate them in f. - */ -bool parallel_for( - const int64_t begin, - const int64_t end, - const int64_t grain_size, - const std::function& f); - -int64_t get_thread_num(); - -void set_thread_num(int64_t thread_num); - -} // namespace extension -} // namespace executorch - -namespace torch { -namespace executor { -// TODO(T197294990): Remove these deprecated aliases once all users have moved -// to the new `::executorch` namespaces. -using ::executorch::extension::get_thread_num; -using ::executorch::extension::parallel_for; -using ::executorch::extension::set_thread_num; -} // namespace executor -} // namespace torch +// This header is a stub left behind after the move to +// executorch/runtime/kernel. As such, it is deprecated; include and +// use the below header directly instead. 
+#include diff --git a/extension/threadpool/CMakeLists.txt b/extension/threadpool/CMakeLists.txt index c1d86acf75d..6e107cb6634 100644 --- a/extension/threadpool/CMakeLists.txt +++ b/extension/threadpool/CMakeLists.txt @@ -21,7 +21,8 @@ if(NOT CMAKE_CXX_STANDARD) endif() add_library( - extension_threadpool threadpool.cpp threadpool_guard.cpp cpuinfo_utils.cpp + extension_threadpool threadpool.cpp threadpool_guard.cpp thread_parallel.cpp + cpuinfo_utils.cpp ) target_link_libraries( extension_threadpool PUBLIC executorch_core cpuinfo pthreadpool @@ -42,3 +43,7 @@ install( INCLUDES DESTINATION ${_common_include_directories} ) + +if(BUILD_TESTING) + add_subdirectory(test) +endif() diff --git a/extension/threadpool/targets.bzl b/extension/threadpool/targets.bzl index 8bb0398b385..1c34dbbc7d4 100644 --- a/extension/threadpool/targets.bzl +++ b/extension/threadpool/targets.bzl @@ -9,6 +9,7 @@ def define_common_targets(): """ _THREADPOOL_SRCS = [ + "thread_parallel.cpp", "threadpool.cpp", "threadpool_guard.cpp", ] + (["fb/threadpool_use_n_threads.cpp"] if not runtime.is_oss else []) @@ -29,6 +30,8 @@ def define_common_targets(): exported_deps = [ third_party_dep("pthreadpool"), third_party_dep("cpuinfo"), + # Allow users to use the header without an extra deps entry. + "//executorch/runtime/kernel:thread_parallel_interface", ], exported_preprocessor_flags = [ "-DET_USE_THREADPOOL", diff --git a/extension/parallel/test/CMakeLists.txt b/extension/threadpool/test/CMakeLists.txt similarity index 53% rename from extension/parallel/test/CMakeLists.txt rename to extension/threadpool/test/CMakeLists.txt index ab37f66c17d..3f9b13f2ab4 100644 --- a/extension/parallel/test/CMakeLists.txt +++ b/extension/threadpool/test/CMakeLists.txt @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# @generated by test/utils/generate_gtest_cmakelists.py +# # This file should be formatted with # ~~~ # cmake-format -i CMakeLists.txt @@ -12,28 +14,14 @@ # cmake_minimum_required(VERSION 3.19) -project(extension_parallel_test) - -# Use C++17 for test. -set(CMAKE_CXX_STANDARD 17) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
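With this change, extension/parallel/thread_parallel.h is only a deprecation stub, and extension_threadpool both compiles thread_parallel.cpp and re-exports //executorch/runtime/kernel:thread_parallel_interface, so call sites only need the new header. A minimal sketch of the intended call site follows; the function scale_rows and its arguments are hypothetical and not part of this patch, and the threadpool-backed implementation is only picked up when extension_threadpool is in the build (otherwise ET_USE_THREADPOOL is undefined and the header's inline serial fallback is used):

// Hedged sketch of a post-migration call site; only the interface header is
// included, and parallel_for comes either from extension_threadpool or from
// the header-only fallback.
#include <executorch/runtime/kernel/thread_parallel_interface.h>

#include <cstdint>

void scale_rows(float* data, int64_t rows, int64_t cols, float factor) {
  const bool ok = ::executorch::extension::parallel_for(
      /*begin=*/0,
      /*end=*/rows,
      /*grain_size=*/1,
      [&](int64_t row_begin, int64_t row_end) {
        for (int64_t r = row_begin; r < row_end; ++r) {
          for (int64_t c = 0; c < cols; ++c) {
            data[r * cols + c] *= factor;
          }
        }
      });
  (void)ok; // true when every work item was processed
}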
include(${EXECUTORCH_ROOT}/build/Test.cmake) -set(_test_srcs thread_parallel_test.cpp ../thread_parallel.cpp) +set(_test_srcs thread_parallel_test.cpp threadpool_test.cpp) et_cxx_test( - extension_parallel_test - SOURCES - ${_test_srcs} - EXTRA_LIBS - pthreadpool - cpuinfo + extension_threadpool_test SOURCES ${_test_srcs} EXTRA_LIBS extension_threadpool ) -target_include_directories( - extension_parallel_test - PRIVATE ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include - ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include -) diff --git a/extension/threadpool/test/targets.bzl b/extension/threadpool/test/targets.bzl index b8a39d8969a..8bdf776c825 100644 --- a/extension/threadpool/test/targets.bzl +++ b/extension/threadpool/test/targets.bzl @@ -18,3 +18,15 @@ def define_common_targets(): "//executorch/extension/threadpool:threadpool", ], ) + + runtime.cxx_test( + name = "thread_parallel_test", + srcs = [ + "thread_parallel_test.cpp", + ], + deps = [ + "//executorch/extension/threadpool:threadpool", + "//executorch/runtime/kernel:thread_parallel_interface", + "//executorch/runtime/platform:platform", + ], + ) diff --git a/extension/parallel/test/thread_parallel_test.cpp b/extension/threadpool/test/thread_parallel_test.cpp similarity index 77% rename from extension/parallel/test/thread_parallel_test.cpp rename to extension/threadpool/test/thread_parallel_test.cpp index d386429100d..fd72211a789 100644 --- a/extension/parallel/test/thread_parallel_test.cpp +++ b/extension/threadpool/test/thread_parallel_test.cpp @@ -11,13 +11,13 @@ #include #include -#include +#include #include using namespace ::testing; using ::executorch::extension::parallel_for; -class ParallelTest : public ::testing::Test { +class ParallelTest : public ::testing::TestWithParam { protected: void SetUp() override { data_.fill(0); @@ -42,12 +42,25 @@ class ParallelTest : public ::testing::Test { } } + template + bool parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const Func& func) { + if (GetParam()) { + return executorch::extension::parallel_for(begin, end, grain_size, func); + } + return executorch::extension::internal::parallel_for_no_threadpool( + begin, end, grain_size, func); + } + std::array data_; std::mutex mutex_; int sum_of_all_elements_; }; -TEST_F(ParallelTest, TestAllInvoked) { +TEST_P(ParallelTest, TestAllInvoked) { EXPECT_TRUE(parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -57,7 +70,7 @@ TEST_F(ParallelTest, TestAllInvoked) { } } -TEST_F(ParallelTest, TestAllInvokedWithMutex) { +TEST_P(ParallelTest, TestAllInvokedWithMutex) { EXPECT_TRUE(parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); })); @@ -70,7 +83,7 @@ TEST_F(ParallelTest, TestAllInvokedWithMutex) { EXPECT_EQ(sum_of_all_elements_, expected_sum); } -TEST_F(ParallelTest, TestInvalidRange) { +TEST_P(ParallelTest, TestInvalidRange) { et_pal_init(); EXPECT_FALSE(parallel_for(10, 0, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); @@ -82,7 +95,7 @@ TEST_F(ParallelTest, TestInvalidRange) { EXPECT_EQ(sum_of_all_elements_, 0); } -TEST_F(ParallelTest, TestInvalidRange2) { +TEST_P(ParallelTest, TestInvalidRange2) { et_pal_init(); EXPECT_FALSE(parallel_for(6, 5, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); @@ -94,7 +107,7 @@ TEST_F(ParallelTest, TestInvalidRange2) { EXPECT_EQ(sum_of_all_elements_, 0); } -TEST_F(ParallelTest, 
TestInvokePartialFromBeginning) { +TEST_P(ParallelTest, TestInvokePartialFromBeginning) { EXPECT_TRUE(parallel_for(0, 5, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -107,7 +120,7 @@ TEST_F(ParallelTest, TestInvokePartialFromBeginning) { } } -TEST_F(ParallelTest, TestInvokePartialToEnd) { +TEST_P(ParallelTest, TestInvokePartialToEnd) { EXPECT_TRUE(parallel_for(5, 10, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -120,7 +133,7 @@ TEST_F(ParallelTest, TestInvokePartialToEnd) { } } -TEST_F(ParallelTest, TestInvokePartialMiddle) { +TEST_P(ParallelTest, TestInvokePartialMiddle) { EXPECT_TRUE(parallel_for(2, 8, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -136,7 +149,7 @@ TEST_F(ParallelTest, TestInvokePartialMiddle) { } } -TEST_F(ParallelTest, TestChunkSize2) { +TEST_P(ParallelTest, TestChunkSize2) { EXPECT_TRUE(parallel_for(0, 10, 2, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -146,7 +159,7 @@ TEST_F(ParallelTest, TestChunkSize2) { } } -TEST_F(ParallelTest, TestChunkSize2Middle) { +TEST_P(ParallelTest, TestChunkSize2Middle) { EXPECT_TRUE(parallel_for(3, 8, 2, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -162,7 +175,7 @@ TEST_F(ParallelTest, TestChunkSize2Middle) { } } -TEST_F(ParallelTest, TestChunkSize3) { +TEST_P(ParallelTest, TestChunkSize3) { EXPECT_TRUE(parallel_for(0, 10, 3, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -172,7 +185,7 @@ TEST_F(ParallelTest, TestChunkSize3) { } } -TEST_F(ParallelTest, TestChunkSize6) { +TEST_P(ParallelTest, TestChunkSize6) { EXPECT_TRUE(parallel_for(0, 10, 6, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -182,7 +195,7 @@ TEST_F(ParallelTest, TestChunkSize6) { } } -TEST_F(ParallelTest, TestChunkSizeTooLarge) { +TEST_P(ParallelTest, TestChunkSizeTooLarge) { EXPECT_TRUE(parallel_for(0, 10, 11, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -191,3 +204,8 @@ TEST_F(ParallelTest, TestChunkSizeTooLarge) { EXPECT_EQ(data_[i], i); } } + +INSTANTIATE_TEST_SUITE_P( + ParallelTestWithOrWithoutThreadpool, + ParallelTest, + ::testing::Values(true, false)); diff --git a/extension/parallel/thread_parallel.cpp b/extension/threadpool/thread_parallel.cpp similarity index 97% rename from extension/parallel/thread_parallel.cpp rename to extension/threadpool/thread_parallel.cpp index fa09b240ad1..3c79a6775e6 100644 --- a/extension/parallel/thread_parallel.cpp +++ b/extension/threadpool/thread_parallel.cpp @@ -10,9 +10,9 @@ #include #include -#include #include #include +#include #include namespace executorch { diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index c6d31c20263..23e26bfa72b 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -43,7 +43,7 @@ endif() list(TRANSFORM _optimized_cpublas__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(cpublas STATIC ${_optimized_cpublas__srcs}) target_link_libraries( - cpublas PUBLIC executorch_core eigen_blas extension_parallel extension_threadpool + cpublas PUBLIC executorch_core eigen_blas extension_threadpool ) target_compile_options(cpublas PUBLIC ${_common_compile_options}) diff --git a/kernels/optimized/blas/BlasKernel.h b/kernels/optimized/blas/BlasKernel.h index c2b03cfebdd..fc47b4482d6 100644 --- a/kernels/optimized/blas/BlasKernel.h +++ b/kernels/optimized/blas/BlasKernel.h @@ -11,8 +11,8 @@ #include #include -#include #include +#include 
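The rewritten test above now runs every case twice, once against the threadpool-backed parallel_for and once against internal::parallel_for_no_threadpool, and its mutex-guarded task illustrates the header's warning: parallel_for does not copy thread-local state, so captured variables mutated inside the callback must be synchronized by the caller. A small sketch under that assumption (sum_all is a hypothetical example, not part of the patch):

// Each chunk accumulates into a local partial sum and only takes the lock
// once per callback invocation before touching the shared total.
#include <executorch/runtime/kernel/thread_parallel_interface.h>

#include <cstdint>
#include <mutex>
#include <vector>

int64_t sum_all(const std::vector<int64_t>& values) {
  int64_t total = 0;
  std::mutex mutex;
  ::executorch::extension::parallel_for(
      0, static_cast<int64_t>(values.size()), /*grain_size=*/1,
      [&](int64_t begin, int64_t end) {
        int64_t partial = 0; // local to this chunk, no locking needed
        for (int64_t i = begin; i < end; ++i) {
          partial += values[i];
        }
        std::lock_guard<std::mutex> guard(mutex); // protect the shared total
        total += partial;
      });
  return total;
}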
#include diff --git a/kernels/optimized/lib_defs.bzl b/kernels/optimized/lib_defs.bzl index 659c7afe090..dd246f38984 100644 --- a/kernels/optimized/lib_defs.bzl +++ b/kernels/optimized/lib_defs.bzl @@ -186,7 +186,10 @@ def define_libs(is_fbcode=False): ], ) - LIBBLAS_DEPS = [third_party_dep("cpuinfo")] + LIBBLAS_DEPS = [ + third_party_dep("cpuinfo"), + "//executorch/extension/threadpool:threadpool", + ] for libblas_name, mkl_dep in [("libblas", "fbsource//third-party/mkl:mkl_lp64_omp"), ("libblas_mkl_noomp", "fbsource//third-party/mkl:mkl")]: runtime.cxx_library( @@ -229,7 +232,6 @@ def define_libs(is_fbcode=False): "DEFAULT": [], }) + LIBBLAS_DEPS, exported_deps = [ - "//executorch/extension/parallel:thread_parallel", "//executorch/kernels/optimized:libutils", "//executorch/runtime/core/exec_aten:lib", ], diff --git a/kernels/portable/cpu/util/broadcast_util.cpp b/kernels/portable/cpu/util/broadcast_util.cpp index 381e07cbe30..28a34426b23 100644 --- a/kernels/portable/cpu/util/broadcast_util.cpp +++ b/kernels/portable/cpu/util/broadcast_util.cpp @@ -269,28 +269,6 @@ ET_NODISCARD Error get_broadcast_target_size( a.sizes(), b.sizes(), out_sizes, out_sizes_len, out_dim); } -void delinearize_index( - size_t linear_index, - executorch::aten::ArrayRef shape, - size_t* out_indexes, - const size_t out_indexes_len) { - ET_CHECK(shape.size() <= out_indexes_len); - for (size_t i = 0; i < shape.size(); ++i) { - auto dim = shape.size() - 1 - i; - auto dim_size = shape[dim]; - out_indexes[dim] = linear_index % dim_size; - linear_index /= dim_size; - } -} - -void delinearize_index( - size_t linear_index, - const Tensor& t, - size_t* out_indexes, - const size_t out_indexes_len) { - delinearize_index(linear_index, t.sizes(), out_indexes, out_indexes_len); -} - size_t linearize_access_indexes( ArrayRef indexes_broadcast_to, ssize_t broadcast_to_ndim, diff --git a/kernels/portable/cpu/util/broadcast_util.h b/kernels/portable/cpu/util/broadcast_util.h index f6bfae9bdaa..ed536f86c2d 100644 --- a/kernels/portable/cpu/util/broadcast_util.h +++ b/kernels/portable/cpu/util/broadcast_util.h @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -207,36 +208,6 @@ ET_NODISCARD inline Error resize_to_broadcast_target_size( ET_DEPRECATED void free_broadcast_tensor( const executorch::aten::Tensor& broadcast_tensor); -/** - * Delinearize a flattened index to per-dimension indexes. - * - * @param[in] linear_index The flattened index - * @param[in] shape The tensor shape - * @param[out] out_indexes The per-dimension indexes - * @param[in] out_indexes_len The maximum size of the out_indexes array - * @returns void - */ -void delinearize_index( - size_t linear_index, - executorch::aten::ArrayRef shape, - size_t* out_indexes, - const size_t out_indexes_len); - -/** - * Delinearize a flattened index to per-dimension indexes. 
- * - * @param[in] linear_index The flattened index - * @param[in] t The tensor object - * @param[out] out_indexes The per-dimension indexes - * @param[in] out_indexes_len The maximum size of the out_indexes array - * @returns void - */ -void delinearize_index( - size_t linear_index, - const Tensor& t, - size_t* out_indexes, - const size_t out_indexes_len); - /** * Return the linear index for broatcast_from tensor, given the indexes and * number of dimensions of broadcast_to tensor, and the shape and strides diff --git a/kernels/portable/cpu/util/delinearize_index.cpp b/kernels/portable/cpu/util/delinearize_index.cpp new file mode 100644 index 00000000000..45378e6b05d --- /dev/null +++ b/kernels/portable/cpu/util/delinearize_index.cpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include + +namespace torch::executor { +void delinearize_index( + size_t linear_index, + executorch::aten::ArrayRef shape, + size_t* out_indexes, + const size_t out_indexes_len) { + ET_CHECK(shape.size() <= out_indexes_len); + for (size_t i = 0; i < shape.size(); ++i) { + auto dim = shape.size() - 1 - i; + auto dim_size = shape[dim]; + out_indexes[dim] = linear_index % dim_size; + linear_index /= dim_size; + } +} + +void delinearize_index( + size_t linear_index, + const Tensor& t, + size_t* out_indexes, + const size_t out_indexes_len) { + delinearize_index(linear_index, t.sizes(), out_indexes, out_indexes_len); +} +} // namespace torch::executor diff --git a/kernels/portable/cpu/util/delinearize_index.h b/kernels/portable/cpu/util/delinearize_index.h new file mode 100644 index 00000000000..3441aa6083f --- /dev/null +++ b/kernels/portable/cpu/util/delinearize_index.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace torch::executor { +/** + * Delinearize a flattened index to per-dimension indexes. + * + * @param[in] linear_index The flattened index + * @param[in] shape The tensor shape + * @param[out] out_indexes The per-dimension indexes + * @param[in] out_indexes_len The maximum size of the out_indexes array + * @returns void + */ +void delinearize_index( + size_t linear_index, + executorch::aten::ArrayRef shape, + size_t* out_indexes, + const size_t out_indexes_len); + +/** + * Delinearize a flattened index to per-dimension indexes. 
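delinearize_index, now split out of broadcast_util, converts a flat index into per-dimension indexes by taking the remainder and dividing by each dimension size from the innermost dimension outward. A standalone illustration of that arithmetic (plain C++, deliberately not calling the ExecuTorch API so it stays self-contained):

// For a 2x3x4 tensor, flat index 17 = 1*(3*4) + 1*4 + 1 maps to {1, 1, 1}.
#include <cassert>
#include <cstddef>

int main() {
  const size_t shape[3] = {2, 3, 4};
  size_t linear_index = 17;
  size_t out_indexes[3] = {0, 0, 0};

  for (size_t i = 0; i < 3; ++i) {
    const size_t dim = 3 - 1 - i; // walk dimensions from last to first
    out_indexes[dim] = linear_index % shape[dim];
    linear_index /= shape[dim];
  }

  assert(out_indexes[0] == 1 && out_indexes[1] == 1 && out_indexes[2] == 1);
  return 0;
}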
+ * + * @param[in] linear_index The flattened index + * @param[in] t The tensor object + * @param[out] out_indexes The per-dimension indexes + * @param[in] out_indexes_len The maximum size of the out_indexes array + * @returns void + */ +void delinearize_index( + size_t linear_index, + const Tensor& t, + size_t* out_indexes, + const size_t out_indexes_len); +} // namespace torch::executor diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index 1c8edb9d3c7..e1c5cadfe84 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -66,9 +66,13 @@ def define_common_targets(): runtime.cxx_library( name = "broadcast_util", - srcs = ["broadcast_util.cpp"], + srcs = [ + "broadcast_util.cpp", + "delinearize_index.cpp", + ], exported_headers = [ "broadcast_util.h", + "delinearize_index.h", ], exported_deps = [ ":broadcast_indexes_range", diff --git a/runtime/kernel/targets.bzl b/runtime/kernel/targets.bzl index d49435f2825..5c95f10276d 100644 --- a/runtime/kernel/targets.bzl +++ b/runtime/kernel/targets.bzl @@ -51,6 +51,20 @@ def define_common_targets(): preprocessor_flags = ["-DMAX_KERNEL_NUM=1"], ) + runtime.cxx_library( + name = "thread_parallel_interface", + exported_headers = ["thread_parallel_interface.h"], + exported_deps = [ + "//executorch/runtime/core:core", + "//executorch/runtime/core/portable_type/c10/c10:c10", + "//executorch/runtime/platform:platform", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + ) + for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" diff --git a/runtime/kernel/thread_parallel_interface.h b/runtime/kernel/thread_parallel_interface.h new file mode 100644 index 00000000000..52100475c7b --- /dev/null +++ b/runtime/kernel/thread_parallel_interface.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include +#include + +namespace executorch { +namespace extension { +namespace internal { +template +inline bool parallel_for_no_threadpool( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const Func& f) { + ET_CHECK_OR_RETURN_FALSE( + begin >= 0 && end >= 0 && end >= begin, + "begin = %" PRId64 ", end = %" PRId64, + begin, + end); + ET_CHECK_OR_RETURN_FALSE(grain_size > 0, "grain_size = %" PRId64, grain_size); +#ifndef NDEBUG + // Go backwards through the range elementwise to catch code that + // assumes parallel_for is in order like a regular for loop. + for (const auto i : c10::irange(begin, end)) { + const auto offset = i - begin; + const auto idx = end - offset - 1; + f(idx, idx + 1); + } +#else // NDEBUG + f(begin, end); +#endif + return true; +} + +} // namespace internal + +#ifdef ET_USE_THREADPOOL +/** + * A helper to run a function in parallel. + * + * begin, end: describe the extent of the workitems via first and last workitem + * to be processed + * grain_size: number of workitems processed by user callback which is + * described below + * f: user function applied in parallel to the chunks, signature: + * void f(int64_t begin, int64_t end) + * Returns true if all work items are processed successfully, false otherwise + * + * Warning: parallel_for does NOT copy thread local states from the current + * thread to the worker threads. 
Users need to protect the access to captured + * data if they mutate them in f. + */ +bool parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const std::function& f); + +int64_t get_thread_num(); + +void set_thread_num(int64_t thread_num); +#else // ET_USE_THREADPOOL +template +bool parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const Func& func) { + return internal::parallel_for_no_threadpool(begin, end, grain_size, func); +} + +inline int64_t get_thread_num() { + return 0; +} + +inline void set_thread_num(int64_t thread_num) { + ET_DCHECK_MSG(false, "cannot set_thread_num without threading support!"); +} +#endif // ET_USE_THREADPOOL +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::get_thread_num; +using ::executorch::extension::parallel_for; +using ::executorch::extension::set_thread_num; +} // namespace executor +} // namespace torch diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json index cc5e625f1e8..be594f9d5f4 100644 --- a/test/utils/OSSTestConfig.json +++ b/test/utils/OSSTestConfig.json @@ -59,6 +59,16 @@ "extension_tensor" ] }, + { + "directory": "extension/threadpool/test", + "sources": [ + "thread_parallel_test.cpp", + "threadpool_test.cpp" + ], + "additional_libs": [ + "extension_threadpool" + ] + }, { "directory": "kernels/portable/cpu/util/test", "sources": [
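When ET_USE_THREADPOOL is not defined, parallel_for resolves to the header-only fallback above, and in debug builds parallel_for_no_threadpool walks the range backwards one element at a time specifically to surface callers that assume chunks arrive in ascending order. A sketch of the difference (hypothetical example, not part of the patch):

#include <executorch/runtime/kernel/thread_parallel_interface.h>

#include <cstdint>
#include <vector>

void example(std::vector<int64_t>& out) {
  // Order-independent: each invocation writes only its own [begin, end) slots,
  // so it behaves the same under the threadpool, the release-mode fallback
  // (one call covering the whole range), and the debug-mode backwards walk.
  ::executorch::extension::parallel_for(
      0, static_cast<int64_t>(out.size()), /*grain_size=*/1,
      [&](int64_t begin, int64_t end) {
        for (int64_t i = begin; i < end; ++i) {
          out[i] = i * i;
        }
      });

  // Order-dependent (a bug): appending assumes chunks arrive front to back.
  // The debug-mode backwards walk produces a reversed vector, exposing the
  // hidden assumption; with a threadpool this is also a data race.
  std::vector<int64_t> appended;
  ::executorch::extension::parallel_for(
      0, static_cast<int64_t>(out.size()), /*grain_size=*/1,
      [&](int64_t begin, int64_t end) {
        for (int64_t i = begin; i < end; ++i) {
          appended.push_back(i);
        }
      });
}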