
Commit 31712d6

mcr229 authored and facebook-github-bot committed
Move extension/fb/threadpool into backends/xnnpack/threadpool
Summary: We are moving threadpool out of extension/fb/ because XNNPACK depends on threadpool. We are also moving the threadpool_use_n_threads files into an fb/ folder within the threadpool folder, so that only the files related to use_n_threads are hidden away under fb/.

Differential Revision: D48251334

fbshipit-source-id: 0fcb0250a8fff4fc95a616e35bd0fe51b3cc3afb
1 parent 67b74cf commit 31712d6

15 files changed: +531 -7 lines
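For context, callers reach the relocated threadpool through the torch::executorch::threadpool API that the new test in this commit exercises. A minimal usage sketch, assuming only the calls that appear in that test (get_threadpool() and run(fn, range) from the moved threadpool.h); the helper name sum_with_threadpool is hypothetical and not part of the change:

#include <executorch/backends/xnnpack/threadpool/threadpool.h>

#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch: sum three values using the shared threadpool.
// run(fn, range) invokes fn(task_id) for task_id in [0, range), as in the
// test added by this commit. Each task writes its own slot, so no mutex is
// needed here (the ParallelReduce test shows the locked variant).
int64_t sum_with_threadpool() {
  const std::vector<int64_t> values = {1, 2, 3};
  std::vector<int64_t> partial(values.size(), 0);

  auto pool = torch::executorch::threadpool::get_threadpool();
  pool->run(
      [&](size_t task_id) { partial[task_id] = values[task_id]; },
      values.size());

  int64_t total = 0;
  for (int64_t p : partial) {
    total += p;
  }
  return total;  // 6
}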

backends/qnnpack/QNNPackBackend.cpp

+1 -1
@@ -9,7 +9,7 @@
 #include <executorch/backends/qnnpack/executor/QNNExecutor.h>
 #include <executorch/backends/qnnpack/qnnpack_schema_generated.h>
 #include <executorch/backends/qnnpack/utils/utils.h>
-#include <executorch/extension/fb/threadpool/threadpool.h>
+#include <executorch/backends/xnnpack/threadpool/threadpool.h>
 #include <executorch/runtime/backend/backend_registry.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>

backends/qnnpack/targets.bzl

+1 -1
@@ -83,7 +83,7 @@ def define_common_targets():
             "//executorch/runtime/core/exec_aten/util:scalar_type_util",
             "//executorch/runtime/core/exec_aten/util:tensor_util",
             "//executorch/runtime/backend:backend_registry",
-            "//executorch/extension/fb/threadpool:threadpool",
+            "//executorch/backends/xnnpack/threadpool:threadpool",
             "//executorch/util:memory_utils",
             "//{prefix}caffe2/aten/src/ATen/native/quantized/cpu/qnnpack:pytorch_qnnpack".format(
                 prefix = (

backends/xnnpack/runtime/XNNCompiler.cpp

+1 -1
@@ -7,8 +7,8 @@
  */
 
 #include <executorch/backends/xnnpack/runtime/XNNCompiler.h>
+#include <executorch/backends/xnnpack/threadpool/threadpool.h>
 #include <executorch/backends/xnnpack/xnnpack_schema_generated.h>
-#include <executorch/extension/fb/threadpool/threadpool.h>
 #include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
 #include <unordered_map>
 
backends/xnnpack/targets.bzl

+1 -1
@@ -54,7 +54,7 @@ def define_common_targets():
             ":xnnpack_schema",
             "//executorch/runtime/backend:backend_registry",
             "//executorch/backends/qnnpack:qnnpack_utils",  # TODO Use (1) portable for choose_qparams(), (2) xnnpack for quantize_per_tensor()
-            "//executorch/extension/fb/threadpool:threadpool",
+            "//executorch/backends/xnnpack/threadpool:threadpool",
             "//executorch/util:memory_utils",
             "//executorch/runtime/core/exec_aten/util:tensor_util",
         ],

backends/xnnpack/threadpool/TARGETS

+6
@@ -0,0 +1,6 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load(":targets.bzl", "define_common_targets")
+
+define_common_targets()
+42
@@ -0,0 +1,42 @@
+load("@fbsource//xplat/executorch/backends/xnnpack/third-party:third_party_libs.bzl", "third_party_dep")
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
+
+    _THREADPOOL_SRCS = [
+        "threadpool.cpp",
+        "threadpool_guard.cpp",
+    ] + (["fb/threadpool_use_n_threads.cpp"] if not runtime.is_oss else [])
+
+    _THREADPOOL_HEADERS = [
+        "threadpool.h",
+        "threadpool_guard.h",
+    ] + (["fb/threadpool_use_n_threads.h"] if not runtime.is_oss else [])
+
+    runtime.cxx_library(
+        name = "threadpool",
+        srcs = _THREADPOOL_SRCS,
+        deps = [
+            "//executorch/runtime/core:core",
+        ],
+        exported_headers = _THREADPOOL_HEADERS,
+        exported_deps = [
+            third_party_dep("pthreadpool"),
+        ],
+        external_deps = ["cpuinfo"],
+        exported_preprocessor_flags = [
+            "-DET_USE_THREADPOOL",
+        ],
+        visibility = [
+            "//executorch/...",
+            "//executorch/backends/...",
+            "//executorch/runtime/backend/...",
+            "//executorch/extension/threadpool/test/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
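The library above exports -DET_USE_THREADPOOL to everything that depends on the new :threadpool target. A hedged sketch of how a dependent might key off that flag; parallel_or_serial_for is a hypothetical helper and the fallback path is an assumption for illustration, not taken from this commit:

#ifdef ET_USE_THREADPOOL
#include <executorch/backends/xnnpack/threadpool/threadpool.h>
#endif

#include <cstddef>

// Hypothetical helper: apply fn(i) for i in [0, range). When the threadpool
// target (and its exported ET_USE_THREADPOOL flag) is in the build, dispatch
// through the shared pool; otherwise fall back to a serial loop.
template <typename Fn>
void parallel_or_serial_for(size_t range, Fn&& fn) {
#ifdef ET_USE_THREADPOOL
  torch::executorch::threadpool::get_threadpool()->run(fn, range);
#else
  for (size_t i = 0; i < range; ++i) {
    fn(i);
  }
#endif
}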
+8
+8
@@ -0,0 +1,8 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
@@ -0,0 +1,20 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
+
+    _THREADPOOL_TESTS = [
+        "threadpool_test.cpp",
+    ] + (["fb/threadpool_use_n_threads_test.cpp"] if not runtime.is_oss else [])
+
+    runtime.cxx_test(
+        name = "threadpool_test",
+        srcs = _THREADPOOL_TESTS,
+        deps = [
+            "//executorch/backends/xnnpack/threadpool:threadpool",
+        ],
+    )
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+#include <mutex>
+#include <numeric>
+#include <random>
+
+#include <executorch/backends/xnnpack/threadpool/threadpool.h>
+#include <executorch/backends/xnnpack/threadpool/threadpool_guard.h>
+
+using namespace ::testing;
+
+namespace {
+
+size_t div_round_up(const size_t divident, const size_t divisor) {
+  return (divident + divisor - 1) / divisor;
+}
+
+void resize_and_fill_vector(std::vector<int32_t>& a, const size_t size) {
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_int_distribution<> distrib(1, size * 2);
+  a.resize(size);
+  auto generator = [&distrib, &gen]() { return distrib(gen); };
+  std::generate(a.begin(), a.end(), generator);
+}
+
+void generate_add_test_inputs(
+    std::vector<int32_t>& a,
+    std::vector<int32_t>& b,
+    std::vector<int32_t>& c_ref,
+    std::vector<int32_t>& c,
+    size_t vector_size) {
+  resize_and_fill_vector(a, vector_size);
+  resize_and_fill_vector(b, vector_size);
+  resize_and_fill_vector(c, vector_size);
+  resize_and_fill_vector(c_ref, vector_size);
+  for (size_t i = 0, size = a.size(); i < size; ++i) {
+    c_ref[i] = a[i] + b[i];
+  }
+}
+
+void generate_reduce_test_inputs(
+    std::vector<int32_t>& a,
+    int32_t& c_ref,
+    size_t vector_size) {
+  resize_and_fill_vector(a, vector_size);
+  c_ref = 0;
+  for (size_t i = 0, size = a.size(); i < size; ++i) {
+    c_ref += a[i];
+  }
+}
+
+void run_lambda_with_size(
+    std::function<void(size_t)> f,
+    size_t range,
+    size_t grain_size) {
+  size_t num_grains = div_round_up(range, grain_size);
+
+  auto threadpool = torch::executorch::threadpool::get_threadpool();
+  threadpool->run(f, range);
+}
+} // namespace
+
+TEST(ThreadPoolTest, ParallelAdd) {
+  std::vector<int32_t> a, b, c, c_ref;
+  size_t vector_size = 100;
+  size_t grain_size = 10;
+
+  auto add_lambda = [&](size_t i) {
+    size_t start_index = i * grain_size;
+    size_t end_index = start_index + grain_size;
+    end_index = std::min(end_index, vector_size);
+    for (size_t j = start_index; j < end_index; ++j) {
+      c[j] = a[j] + b[j];
+    }
+  };
+
+  auto threadpool = torch::executorch::threadpool::get_threadpool();
+  EXPECT_GT(threadpool->get_thread_count(), 1);
+
+  generate_add_test_inputs(a, b, c_ref, c, vector_size);
+  run_lambda_with_size(add_lambda, vector_size, grain_size);
+  EXPECT_EQ(c, c_ref);
+
+  // Try smaller grain size
+  grain_size = 5;
+  generate_add_test_inputs(a, b, c_ref, c, vector_size);
+  run_lambda_with_size(add_lambda, vector_size, grain_size);
+  EXPECT_EQ(c, c_ref);
+
+  vector_size = 7;
+  generate_add_test_inputs(a, b, c_ref, c, vector_size);
+  run_lambda_with_size(add_lambda, vector_size, grain_size);
+  EXPECT_EQ(c, c_ref);
+
+  vector_size = 7;
+  grain_size = 5;
+  generate_add_test_inputs(a, b, c_ref, c, vector_size);
+  run_lambda_with_size(add_lambda, vector_size, grain_size);
+  EXPECT_EQ(c, c_ref);
+}
+
+// Test parallel reduction where we acquire lock within lambda
+TEST(ThreadPoolTest, ParallelReduce) {
+  std::vector<int32_t> a;
+  int32_t c = 0, c_ref = 0;
+  size_t vector_size = 100;
+  size_t grain_size = 11;
+  std::mutex m;
+
+  auto reduce_lambda = [&](size_t i) {
+    size_t start_index = i * grain_size;
+    size_t end_index = start_index + grain_size;
+    end_index = std::min(end_index, vector_size);
+    std::lock_guard<std::mutex> lock(m);
+    for (size_t j = start_index; j < end_index; ++j) {
+      c += a[j];
+    }
+  };
+
+  auto threadpool = torch::executorch::threadpool::get_threadpool();
+  EXPECT_GT(threadpool->get_thread_count(), 1);
+
+  generate_reduce_test_inputs(a, c_ref, vector_size);
+  run_lambda_with_size(reduce_lambda, vector_size, grain_size);
+  EXPECT_EQ(c, c_ref);
+
+  vector_size = 7;
+  c = c_ref = 0;
+  generate_reduce_test_inputs(a, c_ref, vector_size);
+  run_lambda_with_size(reduce_lambda, vector_size, grain_size);
+  EXPECT_EQ(c, c_ref);
+}
+
+// Copied from
+// caffe2/aten/src/ATen/test/test_thread_pool_guard.cp
+TEST(TestNoThreadPoolGuard, TestThreadPoolGuard) {
+  auto threadpool_ptr = torch::executorch::threadpool::get_pthreadpool();
+
+  ASSERT_NE(threadpool_ptr, nullptr);
+  {
+    torch::executorch::threadpool::NoThreadPoolGuard g1;
+    auto threadpool_ptr1 = torch::executorch::threadpool::get_pthreadpool();
+    ASSERT_EQ(threadpool_ptr1, nullptr);
+
+    {
+      torch::executorch::threadpool::NoThreadPoolGuard g2;
+      auto threadpool_ptr2 = torch::executorch::threadpool::get_pthreadpool();
+      ASSERT_EQ(threadpool_ptr2, nullptr);
+    }
+
+    // Guard should restore prev value (nullptr)
+    auto threadpool_ptr3 = torch::executorch::threadpool::get_pthreadpool();
+    ASSERT_EQ(threadpool_ptr3, nullptr);
+  }
+
+  // Guard should restore prev value (pthreadpool_)
+  auto threadpool_ptr4 = torch::executorch::threadpool::get_pthreadpool();
+  ASSERT_NE(threadpool_ptr4, nullptr);
+  ASSERT_EQ(threadpool_ptr4, threadpool_ptr);
+}
+
+TEST(TestNoThreadPoolGuard, TestRunWithGuard) {
+  const std::vector<int64_t> array = {1, 2, 3};
+
+  auto pool = torch::executorch::threadpool::get_threadpool();
+  int64_t inner = 0;
+  {
+    // Run on same thread
+    torch::executorch::threadpool::NoThreadPoolGuard g1;
+    auto fn = [&array, &inner](const size_t task_id) {
+      inner += array[task_id];
+    };
+    pool->run(fn, 3);
+
+    // confirm the guard is on
+    auto threadpool_ptr = torch::executorch::threadpool::get_pthreadpool();
+    ASSERT_EQ(threadpool_ptr, nullptr);
+  }
+  ASSERT_EQ(inner, 6);
+}
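The ParallelAdd test above partitions the output by grain_size inside the lambda while submitting one task per index. A standalone sketch of the same chunking idea, here submitting one task per chunk (ceil(n / grain_size) tasks); it assumes only the API used in the test (get_threadpool()->run(fn, num_tasks) invoking fn(task_id) for task_id in [0, num_tasks)), and chunked_add is a hypothetical helper, not part of the commit:

#include <executorch/backends/xnnpack/threadpool/threadpool.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical example of the grain-size pattern: split [0, n) into
// ceil(n / grain_size) chunks and submit one task per chunk, so every task
// writes a contiguous, disjoint slice and no mutex is required (unlike the
// shared accumulator in ParallelReduce). Assumes a, b, and c all have size n.
void chunked_add(
    const std::vector<int32_t>& a,
    const std::vector<int32_t>& b,
    std::vector<int32_t>& c,
    size_t grain_size) {
  const size_t n = a.size();
  const size_t num_tasks = (n + grain_size - 1) / grain_size;  // ceil division

  auto task = [&](size_t task_id) {
    const size_t start = task_id * grain_size;
    const size_t end = std::min(start + grain_size, n);
    for (size_t j = start; j < end; ++j) {
      c[j] = a[j] + b[j];
    }
  };

  torch::executorch::threadpool::get_threadpool()->run(task, num_tasks);
}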

0 commit comments
