Skip to content

Commit 84c3523

Browse files
authored
Merge pull request #2625 from hedaoyuan/nnpack_lib
NNPACKConvFunction
2 parents a71ed27 + 47f1031 commit 84c3523

File tree

6 files changed

+402
-20
lines changed

6 files changed

+402
-20
lines changed

CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
4949
option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
5050
option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
5151
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
52+
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
5253

5354
# CMAKE_BUILD_TYPE
5455
if(NOT CMAKE_BUILD_TYPE)
@@ -129,6 +130,10 @@ if(WITH_GPU)
129130
endif(NOT WITH_DSO)
130131
endif(WITH_GPU)
131132

133+
if(USE_NNPACK)
134+
list(APPEND EXTERNAL_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB} "rt")
135+
endif(USE_NNPACK)
136+
132137
add_subdirectory(proto)
133138

134139
# "add_subdirectory(paddle)" and "add_subdirectory(python)" should be

paddle/function/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,14 @@ if(WITH_GPU)
1010
cuda_compile(cu_objs ${cu_files})
1111
endif()
1212

13+
if(USE_NNPACK)
14+
include(nnpack/nnpack.cmake)
15+
list(APPEND cpp_files nnpack/NNPACKConvOp.cpp)
16+
if(WITH_TESTING)
17+
add_unittest(NNPACKConvOpTest nnpack/NNPACKConvOpTest.cpp)
18+
endif()
19+
endif()
20+
1321
add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
1422
add_dependencies(paddle_function ${external_project_dependencies})
1523
add_dependencies(paddle_function paddle_proto)
Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License. */
14+
15+
#include "nnpack.h"
16+
#include "paddle/function/ConvOp.h"
17+
18+
// Command-line flags controlling the NNPACK convolution backend.

// When true, the workspace buffer is sized via a dry-run NNPACK call and
// allocated/cached by NNPACKConvFunction itself instead of letting NNPACK
// allocate and free it internally on every invocation.
DEFINE_bool(nnpack_allocate_outside,
            false,
            "Allocate and free workspace memory outside the NNPACK interface.");
// Size of the pthreadpool handed to NNPACK; 0 runs single-threaded.
// NOTE: the original description was two adjacent string literals with no
// separator, which rendered as "threadsdefault:"; a separator is added here.
DEFINE_int32(nnpack_num_threads,
             0,
             "The number of nnpack threads; "
             "default: 0; 0 to disable threadpool.");
25+
26+
namespace paddle {
27+
28+
// Translate a textual algorithm name ("auto", "ft8x8", "ft16x16", "wt8x8",
// "implicit-gemm", "direct") into the corresponding NNPACK enum value.
// Any unrecognized name falls back to automatic algorithm selection.
nnp_convolution_algorithm get_nnp_convolution_algorithm(
    const std::string& algorithm) {
  if (algorithm == "ft8x8") {
    return nnp_convolution_algorithm_ft8x8;
  }
  if (algorithm == "ft16x16") {
    return nnp_convolution_algorithm_ft16x16;
  }
  if (algorithm == "wt8x8") {
    return nnp_convolution_algorithm_wt8x8;
  }
  if (algorithm == "implicit-gemm") {
    return nnp_convolution_algorithm_implicit_gemm;
  }
  if (algorithm == "direct") {
    return nnp_convolution_algorithm_direct;
  }
  // "auto" — and anything we do not recognize — lets NNPACK choose.
  return nnp_convolution_algorithm_auto;
}
46+
47+
template <DeviceType Device>
48+
class NNPACKConvFunction : public ConvFunctionBase {
49+
public:
50+
void init(const FuncConfig& config) override {
51+
ConvFunctionBase::init(config);
52+
CHECK_EQ(groups_, (size_t)1);
53+
algorithm_ = get_nnp_convolution_algorithm(config.get<std::string>("algo"));
54+
// algorithm_ = nnp_convolution_algorithm_auto;
55+
transform_strategy_ = nnp_convolution_transform_strategy_compute;
56+
nnp_status status = nnp_initialize();
57+
CHECK_EQ(status, nnp_status_success);
58+
workspaceBuffer_ = nullptr;
59+
workspaceSize_ = 0;
60+
61+
threadpool_ = nullptr;
62+
if (FLAGS_nnpack_num_threads) {
63+
threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
64+
VLOG(3) << "Number of threads "
65+
<< pthreadpool_get_threads_count(threadpool_);
66+
}
67+
}
68+
69+
~NNPACKConvFunction() {
70+
if (threadpool_) {
71+
pthreadpool_destroy(threadpool_);
72+
}
73+
if (workspaceBuffer_) {
74+
free(workspaceBuffer_);
75+
}
76+
}
77+
78+
virtual void check(const BufferArgs& inputs,
79+
const BufferArgs& outputs) override {
80+
const TensorShape& input = inputs[0].shape();
81+
const TensorShape& filter = inputs[1].shape();
82+
const TensorShape& output = outputs[0].shape();
83+
checkShape(input, filter, output);
84+
}
85+
86+
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
87+
CHECK_EQ(numInputs_, inputs.size());
88+
CHECK_EQ(numOutputs_, outputs.size());
89+
CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
90+
check(inputs, outputs);
91+
const TensorShape& input = inputs[0].shape();
92+
const TensorShape& filter = inputs[1].shape();
93+
const TensorShape& output = outputs[0].shape();
94+
95+
size_t batchSize = input[0];
96+
size_t inputChannels = input[1];
97+
size_t inputHeight = input[2];
98+
size_t inputWidth = input[3];
99+
size_t filterHeight = getFilterHeight(filter);
100+
size_t filterWidth = getFilterWidth(filter);
101+
size_t outputChannels = output[1];
102+
// size_t outputHeight = output[2];
103+
// size_t outputWidth = output[3];
104+
105+
nnp_size inputSize = {.width = inputWidth, .height = inputHeight};
106+
nnp_padding padding = {.top = (size_t)paddingH(),
107+
.right = (size_t)paddingW(),
108+
.bottom = (size_t)paddingH(),
109+
.left = (size_t)paddingW()};
110+
nnp_size kernelSize = {.width = filterWidth, .height = filterHeight};
111+
nnp_size outputSubsampling = {.width = (size_t)strideW(),
112+
.height = (size_t)strideH()};
113+
114+
float* inputData = inputs[0].data<float>();
115+
float* filterData = inputs[1].data<float>();
116+
float* outputData = outputs[0].data<float>();
117+
118+
void* bufferPtr = nullptr;
119+
size_t* sizePtr = nullptr;
120+
size_t needSize;
121+
if (FLAGS_nnpack_allocate_outside) {
122+
if (batchSize == 1) {
123+
nnp_status status = nnp_convolution_inference(algorithm_,
124+
transform_strategy_,
125+
inputChannels,
126+
outputChannels,
127+
inputSize,
128+
padding,
129+
kernelSize,
130+
outputSubsampling,
131+
nullptr,
132+
nullptr,
133+
nullptr,
134+
nullptr,
135+
nullptr,
136+
&needSize,
137+
nnp_activation_identity,
138+
nullptr,
139+
nullptr,
140+
nullptr);
141+
CHECK_EQ(status, nnp_status_success);
142+
} else {
143+
// only supports stride = 1
144+
CHECK_EQ(strideH(), 1);
145+
CHECK_EQ(strideW(), 1);
146+
nnp_status status = nnp_convolution_output(algorithm_,
147+
batchSize,
148+
inputChannels,
149+
outputChannels,
150+
inputSize,
151+
padding,
152+
kernelSize,
153+
nullptr,
154+
nullptr,
155+
nullptr,
156+
nullptr,
157+
nullptr,
158+
&needSize,
159+
nnp_activation_identity,
160+
nullptr,
161+
nullptr,
162+
nullptr);
163+
CHECK_EQ(status, nnp_status_success);
164+
}
165+
166+
VLOG(3) << "workspace size is " << needSize;
167+
if (needSize > workspaceSize_) {
168+
workspaceSize_ = needSize;
169+
if (workspaceBuffer_) {
170+
free(workspaceBuffer_);
171+
} else {
172+
posix_memalign(&workspaceBuffer_, 64, needSize);
173+
}
174+
}
175+
176+
if (needSize) {
177+
bufferPtr = workspaceBuffer_;
178+
sizePtr = &needSize;
179+
}
180+
}
181+
182+
if (batchSize == 1) {
183+
nnp_status status =
184+
nnp_convolution_inference(algorithm_,
185+
transform_strategy_,
186+
inputChannels,
187+
outputChannels,
188+
inputSize,
189+
padding,
190+
kernelSize,
191+
outputSubsampling,
192+
inputData,
193+
filterData,
194+
nullptr, /* bias */
195+
outputData,
196+
bufferPtr,
197+
sizePtr,
198+
nnp_activation_identity,
199+
nullptr,
200+
threadpool_, /* threadpool */
201+
nullptr);
202+
CHECK_EQ(status, nnp_status_success);
203+
} else {
204+
// only supports stride = 1
205+
CHECK_EQ(strideH(), 1);
206+
CHECK_EQ(strideW(), 1);
207+
nnp_status status = nnp_convolution_output(algorithm_,
208+
batchSize,
209+
inputChannels,
210+
outputChannels,
211+
inputSize,
212+
padding,
213+
kernelSize,
214+
inputData,
215+
filterData,
216+
nullptr, /* bias */
217+
outputData,
218+
bufferPtr,
219+
sizePtr,
220+
nnp_activation_identity,
221+
nullptr,
222+
threadpool_, /* threadpool */
223+
nullptr);
224+
CHECK_EQ(status, nnp_status_success);
225+
}
226+
}
227+
228+
private:
229+
nnp_convolution_algorithm algorithm_;
230+
nnp_convolution_transform_strategy transform_strategy_;
231+
void* workspaceBuffer_;
232+
size_t workspaceSize_;
233+
pthreadpool_t threadpool_;
234+
};
235+
236+
REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction);
237+
238+
} // namespace paddle
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License. */
14+
15+
#include <gtest/gtest.h>
16+
#include "paddle/function/Function.h"
17+
#include "paddle/function/FunctionTest.h"
18+
19+
// Selects which NNPACK convolution algorithm the test compares against
// the reference GemmConv implementation; passed through FuncConfig "algo".
DEFINE_string(algo,
              "auto",
              "The algorithm (auto, ft8x8, ft16x16, wt8x8, "
              "implicit-gemm, or direct) for computing convolution of NNPACK.");
23+
24+
namespace paddle {
25+
26+
// Skip parameter combinations a given NNPACK algorithm cannot handle.
// Expands to `continue` statements, so it must be used directly inside the
// innermost loop body (the padding loop below).  `batchSize` is referenced
// from the enclosing scope even though it is not a macro parameter.
#define IS_NNPACK_SUPPORT(algo, filterSize, stride)       \
  if (algo == "direct" && filterSize != 1) continue;      \
  if (algo == "direct" && batchSize != 1) continue;       \
  if (algo == "wt8x8" && filterSize != 3) continue;       \
  if (algo == "implicit-gemm" && batchSize != 1) continue; \
  if (algo != "auto" && algo != "implicit-gemm" && stride > 1) continue;

// Sweeps a grid of convolution configurations and, for each supported one,
// compares the output of `conv2` (NNPACKConv) against the reference `conv1`
// (GemmConv) via Compare2Function.  Construction runs all comparisons.
class ConvolutionTest {
public:
  ConvolutionTest(const std::string& conv1,
                  const std::string& conv2,
                  std::string algo = "auto") {
    for (size_t batchSize : {1, 32}) {
      for (size_t inputSize : {7, 14, 54}) {
        for (size_t filterSize : {1, 3, 5}) {
          for (size_t inputChannels : {3, 64}) {
            for (size_t outputChannels : {3, 64, 128}) {
              // NOTE(review): `break` (not `continue`) — skips all larger
              // outputChannels values once inputChannels < outputChannels.
              if (inputChannels < outputChannels) break;
              for (size_t stride : {1, 2}) {
                // if batchSize > 1 NNPACKConv only supports stride = 1
                if (batchSize > 1 && stride > 1) break;
                for (size_t padding : {0, 1}) {
                  if (padding >= filterSize) break;
                  // Standard conv output size: (in - filter + 2*pad)/stride + 1,
                  // written here with `+ stride` folded into the numerator.
                  size_t outputSize =
                      (inputSize - filterSize + 2 * padding + stride) / stride;
                  IS_NNPACK_SUPPORT(algo, filterSize, stride);
                  LOG(INFO) << " batchSize=" << batchSize
                            << " inputChannels=" << inputChannels
                            << " inputHeight=" << inputSize
                            << " inputWidth=" << inputSize
                            << " outputChannels=" << outputChannels
                            << " filterHeight=" << filterSize
                            << " filterWidth=" << filterSize
                            << " outputHeight=" << outputSize
                            << " outputWidth=" << outputSize
                            << " stride=" << stride << " padding=" << padding;

                  std::vector<size_t> paddings = {padding, padding};
                  std::vector<size_t> strides = {stride, stride};
                  Compare2Function<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test(
                      conv1,
                      conv2,
                      FuncConfig()
                          .set("paddings", paddings)
                          .set("strides", strides)
                          .set("groups", (size_t)1)
                          .set("algo", algo));

                  // NCHW input, (out,in,kh,kw) filter, NCHW output.
                  TensorShape shape0{
                      batchSize, inputChannels, inputSize, inputSize};
                  TensorShape shape1{
                      outputChannels, inputChannels, filterSize, filterSize};
                  TensorShape shape2{
                      batchSize, outputChannels, outputSize, outputSize};
                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape0));
                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape1));
                  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape2));
                  test.run();
                }
              }
            }
          }
        }
      }
    }
  }
};
93+
94+
// Cross-checks NNPACKConv against the reference GemmConv on CPU, using the
// algorithm selected by --algo (default "auto").
TEST(Convolution, NNPACK) {
  // NNPACK only supports stride = 1
  ConvolutionTest test("GemmConv-CPU", "NNPACKConv-CPU", FLAGS_algo);
}
98+
99+
} // namespace paddle
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Find the NNPACK library
# NNPACK_ROOT - where to find NNPACK include and library.
#
# On success sets NNPACK_FOUND=ON, NNPACK_INC_DIR, NNPACK_LIB and
# PTHREADPOOL_LIB, and adds the include directory globally.
# Aborts the configure step if any component is missing.

set(NNPACK_FOUND OFF)
# NNPACK_ROOT is seeded from the environment but cached so it can be
# overridden on the cmake command line.
set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK")
find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include)
find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib)
# pthreadpool is NNPACK's bundled threading dependency.
find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib)

if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB)
  set(NNPACK_FOUND ON)
  INCLUDE_DIRECTORIES(${NNPACK_INC_DIR})
else()
  message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})")
endif()

0 commit comments

Comments
 (0)