
Commit 31712d6

mcr229 authored and facebook-github-bot committed
Move extension/fb/threadpool into backends/xnnpack/threadpool
Summary: We are moving threadpool out of extension/fb/ because XNNPACK depends on threadpool. We are also moving the threadpool_use_n_threads files into an fb/ folder within the threadpool folder, so that only the files related to use_n_threads are hidden away under fb/.

Differential Revision: D48251334

fbshipit-source-id: 0fcb0250a8fff4fc95a616e35bd0fe51b3cc3afb
1 parent 67b74cf commit 31712d6

15 files changed: +531 -7 lines
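For context, callers reach the relocated threadpool through the torch::executorch::threadpool API that the new test in this commit exercises. A minimal usage sketch, assuming only the calls that appear in that test (get_threadpool() and run(fn, range) from the moved threadpool.h); the helper name sum_with_threadpool is hypothetical and not part of the change:

#include <executorch/backends/xnnpack/threadpool/threadpool.h>

#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch: sum three values using the shared threadpool.
// run(fn, range) invokes fn(task_id) for task_id in [0, range), as in the
// test added by this commit. Each task writes its own slot, so no mutex is
// needed here (the ParallelReduce test shows the locked variant).
int64_t sum_with_threadpool() {
  const std::vector<int64_t> values = {1, 2, 3};
  std::vector<int64_t> partial(values.size(), 0);

  auto pool = torch::executorch::threadpool::get_threadpool();
  pool->run(
      [&](size_t task_id) { partial[task_id] = values[task_id]; },
      values.size());

  int64_t total = 0;
  for (int64_t p : partial) {
    total += p;
  }
  return total;  // 6
}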

backends/qnnpack/QNNPackBackend.cpp

+1 -1
@@ -9,7 +9,7 @@
 #include <executorch/backends/qnnpack/executor/QNNExecutor.h>
 #include <executorch/backends/qnnpack/qnnpack_schema_generated.h>
 #include <executorch/backends/qnnpack/utils/utils.h>
-#include <executorch/extension/fb/threadpool/threadpool.h>
+#include <executorch/backends/xnnpack/threadpool/threadpool.h>
 #include <executorch/runtime/backend/backend_registry.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>

backends/qnnpack/targets.bzl

+1 -1
@@ -83,7 +83,7 @@ def define_common_targets():
             "//executorch/runtime/core/exec_aten/util:scalar_type_util",
             "//executorch/runtime/core/exec_aten/util:tensor_util",
             "//executorch/runtime/backend:backend_registry",
-            "//executorch/extension/fb/threadpool:threadpool",
+            "//executorch/backends/xnnpack/threadpool:threadpool",
             "//executorch/util:memory_utils",
             "//{prefix}caffe2/aten/src/ATen/native/quantized/cpu/qnnpack:pytorch_qnnpack".format(
                 prefix = (

backends/xnnpack/runtime/XNNCompiler.cpp

+1 -1
@@ -7,8 +7,8 @@
  */
 
 #include <executorch/backends/xnnpack/runtime/XNNCompiler.h>
+#include <executorch/backends/xnnpack/threadpool/threadpool.h>
 #include <executorch/backends/xnnpack/xnnpack_schema_generated.h>
-#include <executorch/extension/fb/threadpool/threadpool.h>
 #include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
 #include <unordered_map>
 
backends/xnnpack/targets.bzl

+1 -1
@@ -54,7 +54,7 @@ def define_common_targets():
             ":xnnpack_schema",
             "//executorch/runtime/backend:backend_registry",
             "//executorch/backends/qnnpack:qnnpack_utils",  # TODO Use (1) portable for choose_qparams(), (2) xnnpack for quantize_per_tensor()
-            "//executorch/extension/fb/threadpool:threadpool",
+            "//executorch/backends/xnnpack/threadpool:threadpool",
             "//executorch/util:memory_utils",
             "//executorch/runtime/core/exec_aten/util:tensor_util",
         ],

backends/xnnpack/threadpool/TARGETS

+6
@@ -0,0 +1,6 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load(":targets.bzl", "define_common_targets")
+
+define_common_targets()
+42
@@ -0,0 +1,42 @@
+load("@fbsource//xplat/executorch/backends/xnnpack/third-party:third_party_libs.bzl", "third_party_dep")
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
+
+    _THREADPOOL_SRCS = [
+        "threadpool.cpp",
+        "threadpool_guard.cpp",
+    ] + (["fb/threadpool_use_n_threads.cpp"] if not runtime.is_oss else [])
+
+    _THREADPOOL_HEADERS = [
+        "threadpool.h",
+        "threadpool_guard.h",
+    ] + (["fb/threadpool_use_n_threads.h"] if not runtime.is_oss else [])
+
+    runtime.cxx_library(
+        name = "threadpool",
+        srcs = _THREADPOOL_SRCS,
+        deps = [
+            "//executorch/runtime/core:core",
+        ],
+        exported_headers = _THREADPOOL_HEADERS,
+        exported_deps = [
+            third_party_dep("pthreadpool"),
+        ],
+        external_deps = ["cpuinfo"],
+        exported_preprocessor_flags = [
+            "-DET_USE_THREADPOOL",
+        ],
+        visibility = [
+            "//executorch/...",
+            "//executorch/backends/...",
+            "//executorch/runtime/backend/...",
+            "//executorch/extension/threadpool/test/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
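The library above exports -DET_USE_THREADPOOL to everything that depends on the new :threadpool target. A hedged sketch of how a dependent might key off that flag; parallel_or_serial_for is a hypothetical helper and the fallback path is an assumption for illustration, not taken from this commit:

#ifdef ET_USE_THREADPOOL
#include <executorch/backends/xnnpack/threadpool/threadpool.h>
#endif

#include <cstddef>

// Hypothetical helper: apply fn(i) for i in [0, range). When the threadpool
// target (and its exported ET_USE_THREADPOOL flag) is in the build, dispatch
// through the shared pool; otherwise fall back to a serial loop.
template <typename Fn>
void parallel_or_serial_for(size_t range, Fn&& fn) {
#ifdef ET_USE_THREADPOOL
  torch::executorch::threadpool::get_threadpool()->run(fn, range);
#else
  for (size_t i = 0; i < range; ++i) {
    fn(i);
  }
#endif
}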
+8
+8
@@ -0,0 +1,8 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
@@ -0,0 +1,20 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
+
+    _THREADPOOL_TESTS = [
+        "threadpool_test.cpp",
+    ] + (["fb/threadpool_use_n_threads_test.cpp"] if not runtime.is_oss else [])
+
+    runtime.cxx_test(
+        name = "threadpool_test",
+        srcs = _THREADPOOL_TESTS,
+        deps = [
+            "//executorch/backends/xnnpack/threadpool:threadpool",
+        ],
+    )
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+#include <mutex>
+#include <numeric>
+#include <random>
+
+#include <executorch/backends/xnnpack/threadpool/threadpool.h>
+#include <executorch/backends/xnnpack/threadpool/threadpool_guard.h>
+
+using namespace ::testing;
+
+namespace {
+
+size_t div_round_up(const size_t divident, const size_t divisor) {
+  return (divident + divisor - 1) / divisor;
+}
+
+void resize_and_fill_vector(std::vector<int32_t>& a, const size_t size) {
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_int_distribution<> distrib(1, size * 2);
+  a.resize(size);
+  auto generator = [&distrib, &gen]() { return distrib(gen); };
+  std::generate(a.begin(), a.end(), generator);
+}
+
+void generate_add_test_inputs(
+    std::vector<int32_t>& a,
+    std::vector<int32_t>& b,
+    std::vector<int32_t>& c_ref,
+    std::vector<int32_t>& c,
+    size_t vector_size) {
+  resize_and_fill_vector(a, vector_size);
+  resize_and_fill_vector(b, vector_size);
+  resize_and_fill_vector(c, vector_size);
+  resize_and_fill_vector(c_ref, vector_size);
+  for (size_t i = 0, size = a.size(); i < size; ++i) {
+    c_ref[i] = a[i] + b[i];
+  }
+}
+
+void generate_reduce_test_inputs(
+    std::vector<int32_t>& a,
+    int32_t& c_ref,
+    size_t vector_size) {
+  resize_and_fill_vector(a, vector_size);
+  c_ref = 0;
+  for (size_t i = 0, size = a.size(); i < size; ++i) {
+    c_ref += a[i];
+  }
+}
+
+void run_lambda_with_size(
+    std::function<void(size_t)> f,
+    size_t range,
+    size_t grain_size) {
+  size_t num_grains = div_round_up(range, grain_size);
+
+  auto threadpool = torch::executorch::threadpool::get_threadpool();
+  threadpool->run(f, range);
+}
+} // namespace
+
+TEST(ThreadPoolTest, ParallelAdd) {
+  std::vector<int32_t> a, b, c, c_ref;
+  size_t vector_size = 100;
+  size_t grain_size = 10;
+
+  auto add_lambda = [&](size_t i) {
+    size_t start_index = i * grain_size;
+    size_t end_index = start_index + grain_size;
+    end_index = std::min(end_index, vector_size);
+    for (size_t j = start_index; j < end_index; ++j) {
+      c[j] = a[j] + b[j];
+    }
+  };
+
+  auto threadpool = torch::executorch::threadpool::get_threadpool();
+  EXPECT_GT(threadpool->get_thread_count(), 1);
+
+  generate_add_test_inputs(a, b, c_ref, c, vector_size);
+  run_lambda_with_size(add_lambda, vector_size, grain_size);
+  EXPECT_EQ(c, c_ref);
+
+  // Try smaller grain size
+  grain_size = 5;
+  generate_add_test_inputs(a, b, c_ref, c, vector_size);
+  run_lambda_with_size(add_lambda, vector_size, grain_size);
+  EXPECT_EQ(c, c_ref);
+
+  vector_size = 7;
+  generate_add_test_inputs(a, b, c_ref, c, vector_size);
+  run_lambda_with_size(add_lambda, vector_size, grain_size);
+  EXPECT_EQ(c, c_ref);
+
+  vector_size = 7;
+  grain_size = 5;
+  generate_add_test_inputs(a, b, c_ref, c, vector_size);
+  run_lambda_with_size(add_lambda, vector_size, grain_size);
+  EXPECT_EQ(c, c_ref);
+}
+
+// Test parallel reduction where we acquire lock within lambda
+TEST(ThreadPoolTest, ParallelReduce) {
+  std::vector<int32_t> a;
+  int32_t c = 0, c_ref = 0;
+  size_t vector_size = 100;
+  size_t grain_size = 11;
+  std::mutex m;
+
+  auto reduce_lambda = [&](size_t i) {
+    size_t start_index = i * grain_size;
+    size_t end_index = start_index + grain_size;
+    end_index = std::min(end_index, vector_size);
+    std::lock_guard<std::mutex> lock(m);
+    for (size_t j = start_index; j < end_index; ++j) {
+      c += a[j];
+    }
+  };
+
+  auto threadpool = torch::executorch::threadpool::get_threadpool();
+  EXPECT_GT(threadpool->get_thread_count(), 1);
+
+  generate_reduce_test_inputs(a, c_ref, vector_size);
+  run_lambda_with_size(reduce_lambda, vector_size, grain_size);
+  EXPECT_EQ(c, c_ref);
+
+  vector_size = 7;
+  c = c_ref = 0;
+  generate_reduce_test_inputs(a, c_ref, vector_size);
+  run_lambda_with_size(reduce_lambda, vector_size, grain_size);
+  EXPECT_EQ(c, c_ref);
+}
+
+// Copied from
+// caffe2/aten/src/ATen/test/test_thread_pool_guard.cp
+TEST(TestNoThreadPoolGuard, TestThreadPoolGuard) {
+  auto threadpool_ptr = torch::executorch::threadpool::get_pthreadpool();
+
+  ASSERT_NE(threadpool_ptr, nullptr);
+  {
+    torch::executorch::threadpool::NoThreadPoolGuard g1;
+    auto threadpool_ptr1 = torch::executorch::threadpool::get_pthreadpool();
+    ASSERT_EQ(threadpool_ptr1, nullptr);
+
+    {
+      torch::executorch::threadpool::NoThreadPoolGuard g2;
+      auto threadpool_ptr2 = torch::executorch::threadpool::get_pthreadpool();
+      ASSERT_EQ(threadpool_ptr2, nullptr);
+    }
+
+    // Guard should restore prev value (nullptr)
+    auto threadpool_ptr3 = torch::executorch::threadpool::get_pthreadpool();
+    ASSERT_EQ(threadpool_ptr3, nullptr);
+  }
+
+  // Guard should restore prev value (pthreadpool_)
+  auto threadpool_ptr4 = torch::executorch::threadpool::get_pthreadpool();
+  ASSERT_NE(threadpool_ptr4, nullptr);
+  ASSERT_EQ(threadpool_ptr4, threadpool_ptr);
+}
+
+TEST(TestNoThreadPoolGuard, TestRunWithGuard) {
+  const std::vector<int64_t> array = {1, 2, 3};
+
+  auto pool = torch::executorch::threadpool::get_threadpool();
+  int64_t inner = 0;
+  {
+    // Run on same thread
+    torch::executorch::threadpool::NoThreadPoolGuard g1;
+    auto fn = [&array, &inner](const size_t task_id) {
+      inner += array[task_id];
+    };
+    pool->run(fn, 3);
+
+    // confirm the guard is on
+    auto threadpool_ptr = torch::executorch::threadpool::get_pthreadpool();
+    ASSERT_EQ(threadpool_ptr, nullptr);
+  }
+  ASSERT_EQ(inner, 6);
+}
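The ParallelAdd test above partitions the output by grain_size inside the lambda while submitting one task per index. A standalone sketch of the same chunking idea, here submitting one task per chunk (ceil(n / grain_size) tasks); it assumes only the API used in the test (get_threadpool()->run(fn, num_tasks) invoking fn(task_id) for task_id in [0, num_tasks)), and chunked_add is a hypothetical helper, not part of the commit:

#include <executorch/backends/xnnpack/threadpool/threadpool.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical example of the grain-size pattern: split [0, n) into
// ceil(n / grain_size) chunks and submit one task per chunk, so every task
// writes a contiguous, disjoint slice and no mutex is required (unlike the
// shared accumulator in ParallelReduce). Assumes a, b, and c all have size n.
void chunked_add(
    const std::vector<int32_t>& a,
    const std::vector<int32_t>& b,
    std::vector<int32_t>& c,
    size_t grain_size) {
  const size_t n = a.size();
  const size_t num_tasks = (n + grain_size - 1) / grain_size;  // ceil division

  auto task = [&](size_t task_id) {
    const size_t start = task_id * grain_size;
    const size_t end = std::min(start + grain_size, n);
    for (size_t j = start; j < end; ++j) {
      c[j] = a[j] + b[j];
    }
  };

  torch::executorch::threadpool::get_threadpool()->run(task, num_tasks);
}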

0 commit comments
